1 
2 /******************************************************************************
3 * MODULE     : parsehtml.cpp
4 * DESCRIPTION: conversion of xml and html strings into logical html trees
5 * COPYRIGHT  : (C) 2000  Joris van der Hoeven
6 *******************************************************************************
7 * This software falls under the GNU general public license version 3 or later.
8 * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
9 * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
10 ******************************************************************************/
11 
12 #include "convert.hpp"
13 #include "hashset.hpp"
14 #include "converter.hpp"
15 #include "parse_string.hpp"
16 
17 /******************************************************************************
18 * The xml/html parser aims to parse a superset of the set of valid documents.
19 * In other words, no attempts are made to signal error messages for
20 * incorrect documents; in the case of Html we even attempt to correct
21 * common mistakes, like badly structured documents. So correct documents
22 * should be parsed correctly and incorrect documents are transformed into
23 * correct documents in a heuristic way.
24 *
25 * The parser proceeds in three steps: the first pass does all parsing
26 * except for the construction of a tree structure for nested tags.
27 * The second stage takes care of the nesting, while heuristically
28 * correcting improper nested trees, and while taking care of optional
29 * closing tags in the case of Html. The last stage does some final
30 * white space and entity cleanup.
31 *
32 * Present limitations: we do not fully parse <!DOCTYPE ...> constructs yet.
33 * Entities which are present in the DOCTYPE definition of the document
34 * will be expanded. However, external DTD's are not read. Notice also that
35 * it is not yet possible to associate default xml:space attributes to tags.
36 ******************************************************************************/
37 
38 struct xml_html_parser {
39   bool html;
40   parse_string s;
41   hashmap<string,string> entities;
42   array<tree> a;
43   int i, n;
44   tree stack;
45 
46   xml_html_parser ();
skip_spacexml_html_parser47   inline void skip_space () {
48     while (s && is_space (s[0])) s += 1; }
is_name_charxml_html_parser49   inline bool is_name_char (char c) {
50     return is_alpha (c) || is_digit (c) ||
51       (c == '_') || (c == ':') || (c == '.') || (c == '-') ||
52       (((int) ((unsigned char) c)) >= 128); }
53 
54   string transcode (string s);
55 
56   string parse_until (string what);
57   string parse_name ();
58   string parse_quoted ();
59   string expand_entity (string s);
60   string expand_entities (string s);
61   string parse_entity ();
62   tree parse_attribute ();
63   tree parse_opening ();
64   tree parse_closing ();
65   tree parse_pi ();
66   tree parse_comment ();
67   tree parse_cdata ();
68   tree parse_misc ();
69   void parse ();
70 
71   tree parse_system ();
72   tree parse_public ();
73   tree parse_element ();
74   tree parse_attlist ();
75   void parse_entity_decl ();
76   tree parse_notation ();
77   tree parse_doctype ();
78 
79   // NOTE: these routines should remain there even if they are not used
80   bool finalize_preserve_space (string tag);
81   string finalize_space (string s, bool first, bool last);
82   tree finalize_space (tree t);
83   // END NOTE
84   bool build_valid_child (string parent, string child);
85   bool build_must_close (string tag);
86   bool build_can_close (string tag);
87   void build (tree& r);
88 
89   tree finalize_sxml (tree t);
90   tree parse (string s);
91 };
92 
93 /******************************************************************************
94 * Initialization
95 ******************************************************************************/
96 
// Tags for which build () never opens a nested level in Html mode.
static hashset<string> html_empty_tag_table;
// Child tags whose placement is restricted by build_valid_child ().
static hashset<string> html_auto_close_table;
// Tags which build_valid_child () refuses as children of "p"
// (the test is only reached for tags also in html_auto_close_table).
static hashset<string> html_block_table;
// Html entity names (without '&' and ';') mapped to their expansions;
// filled by load_html_entities () from the constructor.
static hashmap<string,string> html_entity ("");
101 
load_html_entities(hashmap<string,string> table,string fname)102 void load_html_entities (hashmap<string, string> table, string fname) {
103   string s;
104   if (DEBUG_VERBOSE) debug_convert << "Loading " << fname << "\n";
105   if (load_string (url ("$TEXMACS_PATH/langs/encoding", fname), s, false)) return;
106   tree t= block_to_scheme_tree (s);
107   if (!is_tuple (t)) return;
108 
109   int i, n= N(t);
110   for (i=0; i<n; i++)
111     if (is_func (t[i], TUPLE, 2) &&
112 	is_atomic (t[i][0]) && is_atomic (t[i][1]))
113       {
114 	string l= t[i][0]->label; if (is_quoted (l)) l= scm_unquote (l);
115 	string r= t[i][1]->label; if (is_quoted (r)) r= scm_unquote (r);
116 	table (l)= r;
117       }
118 }
119 
xml_html_parser()120 xml_html_parser::xml_html_parser (): entities ("") {
121   if (N(html_empty_tag_table) == 0) {
122     html_empty_tag_table->insert ("basefont");
123     html_empty_tag_table->insert ("br");
124     html_empty_tag_table->insert ("area");
125     html_empty_tag_table->insert ("link");
126     html_empty_tag_table->insert ("param");
127     html_empty_tag_table->insert ("hr");
128     html_empty_tag_table->insert ("input");
129     html_empty_tag_table->insert ("col");
130     html_empty_tag_table->insert ("frame");
131     html_empty_tag_table->insert ("isindex");
132     html_empty_tag_table->insert ("base");
133     html_empty_tag_table->insert ("meta");
134     html_empty_tag_table->insert ("img");
135   }
136 
137   if (N(html_auto_close_table) == 0) {
138     html_auto_close_table->insert ("body");
139     html_auto_close_table->insert ("p");
140     html_auto_close_table->insert ("dt");
141     html_auto_close_table->insert ("dd");
142     html_auto_close_table->insert ("li");
143     html_auto_close_table->insert ("option");
144     html_auto_close_table->insert ("thead");
145     html_auto_close_table->insert ("tfoot");
146     html_auto_close_table->insert ("tbody");
147     html_auto_close_table->insert ("colgroup");
148     html_auto_close_table->insert ("tr");
149     html_auto_close_table->insert ("th");
150     html_auto_close_table->insert ("td");
151     html_auto_close_table->insert ("head");
152     html_auto_close_table->insert ("html");
153   }
154 
155   if (N(html_block_table) == 0) {
156     html_block_table->insert ("h1");
157     html_block_table->insert ("h2");
158     html_block_table->insert ("h3");
159     html_block_table->insert ("h4");
160     html_block_table->insert ("h5");
161     html_block_table->insert ("h6");
162     html_block_table->insert ("ul");
163     html_block_table->insert ("ol");
164     html_block_table->insert ("li");
165     html_block_table->insert ("dl");
166     html_block_table->insert ("dd");
167     html_block_table->insert ("dt");
168     html_block_table->insert ("pre");
169     html_block_table->insert ("div");
170     html_block_table->insert ("p");
171     html_block_table->insert ("noscript");
172     html_block_table->insert ("blockquote");
173     html_block_table->insert ("form");
174     html_block_table->insert ("hr");
175     html_block_table->insert ("table");
176     html_block_table->insert ("fieldset");
177     html_block_table->insert ("address");
178   }
179 
180   if (N (html_entity) == 0) {
181     load_html_entities (html_entity, "HTMLlat1.scm");
182     load_html_entities (html_entity, "HTMLspecial.scm");
183     load_html_entities (html_entity, "HTMLsymbol.scm");
184   }
185 }
186 
187 /******************************************************************************
188 * Transcoding input to UTF-8
189 ******************************************************************************/
190 
191 // TODO: support BOM and other bells and whistles
192 // http://www.w3.org/TR/REC-xml#sec-guessing
193 
194 // TODO: support HTML http-equiv Content-Type
195 // http://www.w3.org/TR/html4/charset.html#h-5.2.2
196 
197 // Currently, the input encoding is expected to be ASCII-compatible.
// If no encoding is declared in an <?xml?> prolog, the input is assumed to be
// UTF-8, or ISO-8859-1 if it fails the UTF-8 check (i.e. if iconv cannot
// perform an utf8->utf8 conversion).
200 
201 string
transcode(string s2)202 xml_html_parser::transcode (string s2) {
203   s= parse_string (s2);
204 
205   string encoding;
206   if (test (s, "<?")) {
207     s += 2;
208     string target= parse_name ();
209     skip_space ();
210     if (target == "xml") {
211       // since html==true implies we can accept horribly broken HTML, the
212       // presence of an XML prolog is not enough to clear the flag.
213       /* html= false; */
214       while (s && !test (s, "?>")) {
215 	string attname= parse_name ();
216 	skip_space ();
217 	if (!test (s, "=")) break;
218 	s += 1;
219 	skip_space ();
220 	string val;
221 	if (test (s, "\"")) {
222 	  s += 1;
223 	  val= parse_until ("\"");
224 	  skip_space ();
225 	}
226 	else if (test (s, "'")) {
227 	  s += 1;
228 	  val= parse_until ("'");
229 	  skip_space ();
230 	}
231 	if (attname == "encoding") {
232 	  encoding= upcase_all (val);
233 	  break;
234 	}
235       }
236     }
237   }
238 
239   if (N(encoding) != 0) {
240     // cout << "encoding was specified\n" ;
241     string s3= convert (s2, encoding, "UTF-8");
242     if (N(s3) == 0)
243       /* conversion from specified charset failed, do nothing (and pray) */ ;
244     else return s3;
245   }
246   else {
247     // cout << "guess encoding\n" ;
248     if (check_encoding (s2, "UTF-8"))
249       /* input encoding seems to be utf-8, do nothing */ ;
250     else {
251       string s3= convert (s2, "ISO-8859-1", "UTF-8");
252       if (N(s3) != 0) return s3;
253     }
254   }
255 
256   return s2;
257 }
258 
259 /******************************************************************************
260 * Parsing without structuring
261 ******************************************************************************/
262 
263 string
parse_until(string what)264 xml_html_parser::parse_until (string what) {
265   string r;
266   while (s && !test (s, what)) r << s->read (1);
267   if (test (s, what)) s += N(what);
268   return expand_entities (r);
269 }
270 
271 string
parse_name()272 xml_html_parser::parse_name () {
273   string r;
274   while (s && is_name_char (s[0])) r << s->read (1);
275   if (html) return locase_all (r);
276   return expand_entities (r);
277 }
278 
279 string
expand_entity(string s)280 xml_html_parser::expand_entity (string s) {
281   if (entities->contains (s)) return entities[s];
282   else if (s[0] == '&') {
283     if (N(s)>1 && s[1] == '#') {
284       int i= 2;
285       bool okay= false;
286       string r= convert_char_entity (s, i, okay);
287       if (okay) return r;
288       return s;
289     }
290     else if (html) {
291       string ss= s (1, s [N(s)-1] == ';' ? N(s)-1 : N(s));
292       if (html_entity->contains (ss))
293 	// HTML entity references expand to character references
294 	// so they need to be finalized a second time.
295 	return expand_entity (html_entity [ss]);
296     }
297   }
298   return s;
299 }
300 
301 string
expand_entities(string s)302 xml_html_parser::expand_entities (string s) {
303   string r;
304   int i, n= N(s);
305   for (i=0; i<n; ) {
306     if (s[i] == '&' || s[i] == '%') {
307       int start= i++;
308       if (i<n && s[i] == '#') {
309 	i++;
310 	if (i<n && (s[i] == 'x' || s[i] == 'X')) {
311 	  i++;
312 	  while (i<n && is_hex_digit (s[i])) i++;
313 	}
314 	else while (i<n && is_digit (s[i])) i++;
315       }
316       else while (i<n && is_name_char (s[i])) i++;
317       if (i<n && s[i] == ';') i++;
318       r << expand_entity (s (start, i));
319     }
320     else r << s[i++];
321   }
322   if (r == s) return r;
323   return expand_entities (r);
324 }
325 
326 string
parse_entity()327 xml_html_parser::parse_entity () {
328   string r= s->read (1);
329   if (test (s, "#")) {
330     r << s->read (1);
331     if (test (s, "x") || test (s, "X")) {
332       r << s->read (1);
333       while (s && is_hex_digit (s[0])) r << s->read (1);
334     }
335     else while (s && is_digit (s[0])) r << s->read (1);
336   }
337   else while (s && is_name_char (s[0])) r << s->read (1);
338   if (test (s, ";")) r << s->read (1);
339   string x= expand_entity (r);
340   if (x == r || r == "&lt;" || r == "&amp;") return x;
341   s->write (x);
342   return "";
343 }
344 
345 string
parse_quoted()346 xml_html_parser::parse_quoted () {
347   if (test (s, "\42")) {
348     s += 1;
349     return parse_until ("\42");
350   }
351   if (test (s, "'")) {
352     s += 1;
353     return parse_until ("'");
354   }
355   return "";
356 }
357 
358 tree
parse_attribute()359 xml_html_parser::parse_attribute () {
360   string attr= parse_name (), val;
361   bool no_val= false;
362   skip_space ();
363   if (test (s, "=")) s += 1;
364   skip_space ();
365   if (test (s, "\42") || test (s, "'"))
366     val= parse_quoted ();
367   else { // for Html
368     string r;
369     while (s) {
370       if (is_space (s[0]) || (s[0]=='<') || (s[0]=='>')) break;
371       r << s->read (1);
372     }
373     val   = r;
374     no_val= N(r) == 0;
375   }
376   if (!no_val) return tuple ("attr", attr, val);
377   else if (attr != "") return tuple ("attr", attr);
378   else return tuple ("attr");
379 }
380 
381 tree
parse_opening()382 xml_html_parser::parse_opening () {
383   s += 1;
384   string name= parse_name ();
385   tree t= tuple ("begin", name);
386   while (true) {
387     skip_space ();
388     if (!s || s[0] == '>' || test (s, "/>")) break;
389     tree attr= parse_attribute ();
390     if (attr == tuple ("attr")) break;
391     t << attr;
392   }
393   if (test (s, "/>")) { t[0]= "tag"; s += 2; }
394   else if (test (s, ">")) s += 1;
395   return t;
396 }
397 
398 tree
parse_closing()399 xml_html_parser::parse_closing () {
400   s += 2;
401   string name= parse_name ();
402   (void) parse_until (">");
403   return tuple ("end", name);
404 }
405 
406 tree
parse_pi()407 xml_html_parser::parse_pi () {
408   s += 2;
409   string name= parse_name ();
410   skip_space ();
411   return tuple ("pi", name, parse_until ("?>"));
412 }
413 
414 tree
parse_comment()415 xml_html_parser::parse_comment () {
416   s += 4;
417   return tuple ("comment", parse_until ("-->"));
418 }
419 
420 tree
parse_cdata()421 xml_html_parser::parse_cdata () {
422   s += 9;
423   return tuple ("cdata", parse_until ("]]>"));
424 }
425 
426 tree
parse_misc()427 xml_html_parser::parse_misc () {
428   s += 2;
429   tree t= tuple ("misc");
430   while (true) {
431     skip_space ();
432     if (test (s, ">")) { s += 1; break; }
433     string r;
434     while (s) {
435       if (is_space (s[0]) || (s[0] == '>')) break;
436       r << s->read (1);
437     }
438     t << r;
439   }
440   return t;
441 }
442 
443 void
parse()444 xml_html_parser::parse () {
445   string r;
446   while (s) {
447     if (s[0] == '<') {
448       if (N(r) != 0) { a << tree (r); }
449       if (test (s, "</")) a << parse_closing ();
450       else if (test (s, "<?")) a << parse_pi ();
451       else if (test (s, "<!--")) a << parse_comment ();
452       else if (test (s, "<![CDATA[")) a << parse_cdata ();
453       else if (test (s, "<!DOCTYPE")) a << parse_doctype ();
454       else if (test (s, "<!")) a << parse_misc ();
455       else a << parse_opening ();
456       r= "";
457     }
458     else if (s[0] == '&') r << parse_entity ();
459     else r << s->read (1);
460   }
461   if (N(r) != 0) a << tree (r);
462 }
463 
464 /******************************************************************************
465 * Parsing the document type
466 ******************************************************************************/
467 
468 tree
parse_system()469 xml_html_parser::parse_system () {
470   s += 6;
471   tree st= tuple ("system");
472   skip_space ();
473   st << parse_quoted ();
474   return st;
475 }
476 
477 tree
parse_public()478 xml_html_parser::parse_public () {
479   s += 6;
480   tree st= tuple ("public");
481   skip_space ();
482   st << parse_quoted ();
483   skip_space ();
484   st << parse_quoted ();
485   return st;
486 }
487 
488 tree
parse_element()489 xml_html_parser::parse_element () {
490   s += 9;
491   return tuple ("element", parse_until (">"));
492 }
493 
494 tree
parse_attlist()495 xml_html_parser::parse_attlist () {
496   s += 9;
497   return tuple ("attlist", parse_until (">"));
498 }
499 
500 void
parse_entity_decl()501 xml_html_parser::parse_entity_decl () {
502   s += 8;
503   skip_space ();
504   bool parameter= test (s, "%");
505   if (parameter) { s += 1; skip_space (); }
506   string name= parse_name ();
507   if (parameter) name= "%" * name * ";";
508   else name= "&" * name * ";";
509   skip_space ();
510 
511   if (test (s, "SYSTEM") || test (s, "PUBLIC")) {
512     // TODO: allow for loading of external entities using wget
513     if (test (s, "SYSTEM")) (void) parse_system ();
514     else (void) parse_public ();
515     skip_space ();
516     if (test (s, "NDATA")) {
517       s += 5;
518       skip_space ();
519       (void) parse_name ();
520     }
521   }
522   else {
523     string val= parse_quoted ();
524     val= expand_entities (val);
525     entities (name) = val;
526     // cout << name << " := " << val << "\n";
527   }
528 
529   skip_space ();
530   if (test (s, ">")) s += 1;
531 }
532 
533 tree
parse_notation()534 xml_html_parser::parse_notation () {
535   s += 10;
536   return tuple ("notation", parse_until (">"));
537 }
538 
539 tree
parse_doctype()540 xml_html_parser::parse_doctype () {
541   s += 9;
542   tree dt= tuple ("doctype");
543   skip_space ();
544   dt << parse_name ();
545   skip_space ();
546   if (test (s, "SYSTEM")) dt << parse_system ();
547   else if (test (s, "PUBLIC")) dt << parse_public ();
548   skip_space ();
549 
550   if (test (s, "[")) {
551     s += 1;
552     while (s) {
553       skip_space ();
554       if (test (s, "]")) { s += 1; break; }
555       else if (test (s, "<!ELEMENT")) dt << parse_element ();
556       else if (test (s, "<!ATTLIST")) dt << parse_cdata ();
557       else if (test (s, "<!ENTITY")) parse_entity_decl ();
558       else if (test (s, "<!NOTATION")) a << parse_notation ();
559       else if (test (s, "<?")) dt << parse_pi ();
560       else if (test (s, "<!--")) dt << parse_comment ();
561       else if (s[0] == '&' || s[0] == '%') (void) parse_entity ();
562       else s += 1;
563     }
564   }
565 
566   skip_space ();
567   if (test (s, ">")) s += 1;
568   return dt;
569 }
570 
571 /******************************************************************************
572 * Building the structured parse tree with error correction
573 ******************************************************************************/
574 
575 bool
build_valid_child(string parent,string child)576 xml_html_parser::build_valid_child (string parent, string child) {
577   if (!html) return true;
578   if ((parent == "<bottom>") || (parent == "html") || (parent == "body"))
579     return true;
580   if (html_empty_tag_table->contains (parent)) return false;
581   if (!html_auto_close_table->contains (child)) return true;
582   if (parent == "p") return !html_block_table->contains (child);
583   if ((child == "dt") || (child == "dd")) return parent == "dl";
584   if (child == "li")
585     return (parent == "ul") || (parent == "ol") ||
586            (parent == "dir") || (parent == "menu");
587   if (child == "option") return (parent == "select") || (parent == "optgroup");
588   if ((child == "thead") || (child == "tfoot") || (child == "tbody"))
589     return parent == "table";
590   if (child == "colgroup") return parent == "table";
591   if (child == "col") return (parent == "table") || (parent == "colgroup");
592   if (child == "tr")
593     return (parent == "table") || (parent == "thead") ||
594            (parent == "tfoot") || (parent == "tbody");
595   if ((child == "th") || (child == "td"))
596     return (parent == "tr") ||
597            (parent == "table") || (parent == "thead") ||
598            (parent == "tfoot") || (parent == "tbody");
599   return true;
600 }
601 
602 bool
build_must_close(string tag)603 xml_html_parser::build_must_close (string tag) {
604   if (build_valid_child (stack[0]->label, tag)) return false;
605   // if !html, we have already returned false
606   tree counter= stack;
607   while (counter != tuple ("<bottom>")) {
608     if (build_valid_child (counter[0]->label, tag)) return true;
609     counter= counter[1];
610   }
611   // since <html> and <body> can have any child we only get here when parsing
612   // something where both are omitted and we can close nodes up to the root.
613   return true;
614 }
615 
616 bool
build_can_close(string tag)617 xml_html_parser::build_can_close (string tag) {
618   if (N(stack) < 2) return false;
619   tree counter= stack[1];
620   while (counter != tuple ("<bottom>")) {
621     if (counter[0]->label == tag) return true;
622     counter= counter[1];
623   }
624   return false;
625 }
626 
627 void
build(tree & r)628 xml_html_parser::build (tree& r) {
629   while (i<n) {
630     if (is_tuple (a[i], "begin")) {
631       string name= a[i][1]->label;
632       if (build_must_close (name)) return;
633       tree sub= copy (a[i]); sub[0]= "tag";
634       i++;
635       if (html && html_empty_tag_table->contains (name))
636 	r << sub;
637       else {
638 	stack= tuple (name, stack);
639 	build (sub);
640 	r << sub;
641 	stack= stack[1];
642       }
643     }
644     else if (is_tuple (a[i], "end")) {
645       if (stack[0]->label == a[i][1]->label) { i++; return; }
646       if (build_can_close (a[i][1]->label)) return;
647       i++;
648     }
649     else r << a[i++];
650   }
651 }
652 
653 /******************************************************************************
654 * Finalization
655 ******************************************************************************/
656 
657 bool
finalize_preserve_space(string tag)658 xml_html_parser::finalize_preserve_space (string tag) {
659   return tag == "pre";
660 }
661 
662 string
finalize_space(string s,bool first,bool last)663 xml_html_parser::finalize_space (string s, bool first, bool last) {
664   int i, n= N(s);
665   string r;
666   bool flag= first;
667   for (i=0; i<n; i++)
668     if (is_space (s[i])) {
669       if (!flag) r << ' ';
670       flag= true;
671     }
672     else {
673       r << s[i];
674       flag= false;
675     }
676   n= N(r);
677   if (last && (n>0) && (r[n-1] == ' '))
678     r->resize (n-1);
679   return r;
680 }
681 
682 tree
finalize_space(tree t)683 xml_html_parser::finalize_space (tree t) {
684   if (is_atomic (t) || (!is_tuple (t, "tag"))) return t;
685   else {
686     int i, n= N(t);
687     tree r= tuple (t[0], t[1]);
688     int first= -1, last= -1;
689     for (i=2; i<n; i++)
690       if (!is_tuple (t[i], "attr")) {
691 	first= i; break;
692       }
693     if (!is_tuple (t[n-1], "attr"))
694       last= n-1;
695     (void) first; (void) last;
696     for (i=2; i<n; i++) {
697       if (is_atomic (t[i])) {
698 	if (finalize_preserve_space (t[1]->label)) r << t[i];
699 	else {
700 	  string s= finalize_space (t[i]->label, i==2, i==(n-1));
701 	  if (s != "") r << s;
702 	}
703       }
704       else if (is_tuple (t[i], "tag")) r << finalize_space (t[i]);
705       else r << t[i];
706     }
707     return r;
708   }
709 }
710 
711 static string
simple_quote(string s)712 simple_quote (string s) {
713   return "\"" * s * "\"";
714 }
715 
716 tree
finalize_sxml(tree t)717 xml_html_parser::finalize_sxml (tree t) {
718   if (!is_tuple (t, "tag")) return ""; // sanity
719   int i, n= N(t);
720   tree tag = tuple (t[1]);
721   if (t[1] == "<document>") tag= tuple ("*TOP*");
722   tree attrs = tuple ("@");
723   tree content = tuple ();
724   for (i=2; i<n; i++)
725     if (is_tuple (t[i], "attr")) {
726       tree attr;
727       if (N(t[i]) == 2) attr= tuple (t[i][1]);
728       else attr= tuple (t[i][1]->label, simple_quote (t[i][2]->label));
729       attrs << attr;
730     }
731     else if (is_tuple (t[i], "tag"))
732       content << finalize_sxml (t[i]);
733     else if (is_atomic (t[i]))
734       content << simple_quote (t[i]->label);
735     else if (is_tuple (t[i], "pi"))
736       content << tuple ("*PI*", t[i][1]->label, simple_quote (t[i][2]->label));
737     else if (is_tuple (t[i], "doctype"))
738       // TODO: convert DTD declarations
739       content << tuple ("*DOCTYPE*", simple_quote (t[i][1]->label));
740     else if (is_tuple (t[i], "cdata"))
741       content << simple_quote (t[i][1]->label);
742   if (N(attrs) > 1) tag << attrs;
743   tag << A(content);
744   return tag;
745 }
746 
747 /******************************************************************************
* Main parsing routine: runs all stages on an input string
749 ******************************************************************************/
750 
751 tree
parse(string s2)752 xml_html_parser::parse (string s2) {
753   // end of line handling
754   string s3;
755   i= 0, n= N(s2);
756   bool is_cr= false;
757   while (i<n) {
758     bool prev_is_cr= is_cr;
759     is_cr= false;
760     char c= s2[i];
761     if (c == '\15') {
762       s3 << '\12';
763       is_cr= true;
764     }
765     else if (prev_is_cr && (c == '\12')) /* no-op */;
766     else s3 << c;
767     i++;
768   }
769   s2= s3;
770 
771   // cout << "Transcoding " << s2 << "\n";
772   if (html) s2= transcode (s2);
773   // cout << HRULE << LF;
774   s= parse_string (s2);
775   //cout << "Parsing " << s << "\n";
776   parse ();
777   // cout << HRULE << LF;
778   // cout << "a= " << a << "\n";
779   i= 0; n= N(a); stack= tuple ("<bottom>");
780   tree r= tuple ("tag", "<document>");
781   build (r);
782   // cout << HRULE << LF;
783   // print_tree (r);
784   r= finalize_sxml (r);
785   // cout << HRULE << LF;
786   // print_tree (r);
787   return r;
788 }
789 
790 /******************************************************************************
791 * Interface
792 ******************************************************************************/
793 
794 tree
parse_xml(string s)795 parse_xml (string s) {
796   xml_html_parser parser;
797   parser.html= false;
798   tree t= parser.parse (s);
799   return t;
800 }
801 
802 tree
parse_html(string s)803 parse_html (string s) {
804   xml_html_parser parser;
805   parser.html= true;
806   tree t= parser.parse (s);
807   return t;
808 }
809