1 /* retawq/parser.c - content parsing (HTML, ...)
2    This file is part of retawq (<http://retawq.sourceforge.net/>), a network
3    client created by Arne Thomassen; retawq is basically released under certain
4    versions of the GNU General Public License and WITHOUT ANY WARRANTY.
5    Read the file COPYING for license details, README for program information.
6    Copyright (C) 2001-2005 Arne Thomassen <arne@arne-thomassen.de>
7 */
8 
9 #include "stuff.h"
10 #include "parser.h"
11 
/* File-local i18n buffer; the macro is defined outside this file.
   NOTE(review): presumably needed by the _() message lookups used further
   down - confirm against its definition. */
declare_local_i18n_buffer
#if CONFIG_DEBUG
/* Debug builds only: prsdbg() writes a message string to the descriptor
   fd_parsertest via my_write_str(). debugstrbuf is presumably the scratch
   buffer in which such messages are composed (its users are outside this
   part of the file). */
static const_after_init int fd_parsertest;
static char debugstrbuf[STRBUF_SIZE];
#define prsdbg(msg) my_write_str(fd_parsertest, msg)
#endif
18 
/* String constants which are used at more than one place below (tag names,
   attribute names, entity names and entity results), so that each text
   exists only once in the program. */
static const char strCommentTag[] = "!--", strSingleQuote[] = "'",
  strPipe[] = "|", strHref[] = "href", strType[] = "type",
  strName[] = "name", strValue[] = "value", strDisabled[] = "disabled",
  strTitle[] = "title", strStyle[] = "style", strReadonly[] = "readonly",
  strAlt[] = "alt", strLabel[] = "label", strSize[] = "size",
  strCenter[] = "center", strP[] = "p", strPi[] = "pi", strOtimes[] = "otimes";
/* "times" is simply the tail of "otimes", so it shares that storage. */
#define strTimes (strOtimes + 1)

/* These serve as entity names in chent[] and as entity suffixes in
   chentsuffix[]; strSup is also the name of the "sup" tag in tagdata[]. */
static const char strAcute[] = "acute", strCedil[] = "cedil",
  strCirc[] = "circ", strTilde[] = "tilde", strUml[] = "uml", strSup[] = "sup";
29 
/* Per-tag property bits; the values are distinct bits and are OR-ed together
   in the tagdata[] table. The three ...Endtag flags apparently say whether an
   end tag is required, optional or forbidden for the tag.
   NOTE(review): the exact meaning of htfForbidPre, htfBlock, htfPar and
   htfSoakUpText is defined by code outside this part of the file - confirm
   there before relying on the names. */
my_enum1 enum
{ htfNone = 0, htfRequireEndtag = 0x01, htfAllowEndtag = 0x02,
  htfForbidEndtag = 0x04, htfForbidPre = 0x08, htfBlock = 0x10,
  htfPar = 0x20, htfSoakUpText = 0x40
} my_enum2(unsigned char) tHtmlTagFlags;
35 
36 /* begin-autogenerated */
/* Table of all known HTML tags. The table is autogenerated - don't edit it
   by hand. do_lookup_tagkind() does a binary search with strcmp() on the
   names, so the entries must stay sorted; lookup_tagkind() converts a table
   index to a tHtmlTagKind value by adding TAGOFFSET. */
#define TAGOFFSET (1)
#define NUM_TAGDATA (66)
static const struct
{ const char* name; /* (sorted in alphabetical order) */
  tHtmlTagFlags flags; /* OR-ed htf... bits */
} tagdata[NUM_TAGDATA] =
{ { strA, htfRequireEndtag },
  { "address", htfRequireEndtag | htfBlock | htfPar },
  { "area", htfForbidEndtag },
  { "b", htfRequireEndtag },
  { "big", htfRequireEndtag },
  { "blockquote", htfRequireEndtag | htfBlock | htfPar },
  { "body", htfAllowEndtag },
  { "br", htfForbidEndtag },
  { strButton, htfRequireEndtag },
  { "caption", htfRequireEndtag | htfPar },
  { strCenter, htfRequireEndtag | htfBlock | htfPar }, /* deprecated */
  { "cite", htfRequireEndtag },
  { "dd", htfAllowEndtag | htfBlock },
  { "del", htfRequireEndtag },
  { "dfn", htfRequireEndtag },
  { "dir", htfRequireEndtag | htfBlock | htfPar }, /* deprecated */
  { "div", htfRequireEndtag | htfBlock | htfPar },
  { "dl", htfRequireEndtag | htfBlock | htfPar },
  { "dt", htfAllowEndtag | htfBlock },
  { "em", htfRequireEndtag },
  { "fieldset", htfRequireEndtag | htfBlock | htfPar },
  { "font", htfRequireEndtag }, /* deprecated */
  { "form", htfRequireEndtag | htfBlock | htfPar },
  { "frame", htfForbidEndtag | htfBlock },
  { "frameset", htfRequireEndtag | htfBlock | htfPar },
  { "h1", htfRequireEndtag | htfBlock | htfPar },
  { "h2", htfRequireEndtag | htfBlock | htfPar },
  { "h3", htfRequireEndtag | htfBlock | htfPar },
  { "h4", htfRequireEndtag | htfBlock | htfPar },
  { "h5", htfRequireEndtag | htfBlock | htfPar },
  { "h6", htfRequireEndtag | htfBlock | htfPar },
  { "head", htfAllowEndtag | htfForbidPre },
  { "hr", htfForbidEndtag | htfBlock },
  { strHtml, htfAllowEndtag },
  { "i", htfRequireEndtag },
  { "iframe", htfForbidEndtag | htfBlock },
  { "img", htfForbidEndtag },
  { "input", htfForbidEndtag },
  { "li", htfAllowEndtag | htfBlock },
  { "menu", htfRequireEndtag | htfBlock | htfPar }, /* deprecated */
  { "meta", htfForbidEndtag },
  { "noframes", htfRequireEndtag | htfBlock | htfPar },
  { "noscript", htfRequireEndtag | htfBlock | htfPar },
  { "object", htfRequireEndtag },
  { "ol", htfRequireEndtag | htfBlock | htfPar },
  { "optgroup", htfRequireEndtag },
  { "option", htfAllowEndtag | htfSoakUpText | htfForbidPre },
  { strP, htfAllowEndtag | htfBlock | htfPar },
  { "pre", htfRequireEndtag | htfBlock | htfPar },
  { "q", htfRequireEndtag }, /* stolen from J.B. & the next generation :-) */
  { "s", htfRequireEndtag }, /* deprecated */
  { "script", htfRequireEndtag | htfSoakUpText },
  { strSelect, htfRequireEndtag },
  { "strike", htfRequireEndtag }, /* deprecated */
  { "strong", htfRequireEndtag },
  { strStyle, htfRequireEndtag | htfSoakUpText },
  { "sub", htfRequireEndtag },
  { strSup, htfRequireEndtag },
  { "table", htfRequireEndtag | htfBlock | htfPar },
  { "td", htfAllowEndtag },
  { "textarea", htfRequireEndtag | htfSoakUpText },
  { "th", htfAllowEndtag },
  { strTitle, htfRequireEndtag | htfSoakUpText | htfForbidPre },
  { "tr", htfAllowEndtag | htfPar },
  { "u", htfRequireEndtag }, /* deprecated */
  { "ul", htfRequireEndtag | htfBlock | htfPar }
};
110 
/* Compile-time selection between the two variants of the character entity
   tables below:
   - OPTION_CED > 0: chent[] has 141 entries; CED01() picks its second
     argument, DECLCED1() emits a table entry and CODE01() picks its second
     index.
   - OPTION_CED == 0: chent[] has only 89 entries because the DECLCED1()
     entries vanish; CED01() picks its first argument (a plain-ASCII
     replacement text) and CODE01() its first index. A negative index in
     chentcode[] then refers to chentcode_rerouter[] (position -1 - index),
     which holds the unaccented base letter to show instead. */
#if OPTION_CED > 0
#define NUM_CHENT (141)
#define CED01(ced0, ced1) ced1
#define DECLCED1(name, ced1) { name, ced1 },
#define CODE01(index0, index1) index1
#else
#define NUM_CHENT (89)
#define CED01(ced0, ced1) ced0
#define DECLCED1(name, ced1)
#define CODE01(index0, index1) index0
static const char chentcode_rerouter[] = "AaCcEeIiNnOoUuYy";
#endif
123 
124 #define MAXLEN_CHENT (6)
125 static
126 #if OPTION_CED < 2
127   const
128 #else
129   const_after_init
130 #endif
131   struct
132 { const char* name; /* (sorted in strcmp() order) */
133   const char* result;
134 } chent[NUM_CHENT] =
135 { DECLCED1("AElig", "�") /*0*/
136   DECLCED1("Aacute", "�")
137   DECLCED1("Acirc", "�")
138   DECLCED1("Agrave", "�")
139   DECLCED1("Aring", "�")
140   DECLCED1("Atilde", "�") /*5*/
141   { "Auml", CED01("Ae", "�") },
142   DECLCED1("Ccedil", "�")
143   { "Dagger", strHm },
144   { "ETH", CED01("DH", "�") },
145   DECLCED1("Eacute", "�") /*10*/
146   DECLCED1("Ecirc", "�")
147   DECLCED1("Egrave", "�")
148   DECLCED1("Euml", "�")
149   DECLCED1("Iacute", "�")
150   DECLCED1("Icirc", "�") /*15*/
151   DECLCED1("Igrave", "�")
152   DECLCED1("Iuml", "�")
153   DECLCED1("Ntilde", "�")
154   DECLCED1("Oacute", "�")
155   DECLCED1("Ocirc", "�") /*20*/
156   DECLCED1("Ograve", "�")
157   DECLCED1("Oslash", "�")
158   DECLCED1("Otilde", "�")
159   { "Ouml", CED01("Oe", "�") },
160   { "Prime", "''" }, /*25*/
161   { "THORN", CED01("P", "�") },
162   DECLCED1("Uacute", "�")
163   DECLCED1("Ucirc", "�")
164   DECLCED1("Ugrave", "�")
165   { "Uuml", CED01("Ue", "�") }, /*30*/
166   DECLCED1("Yacute", "�")
167   DECLCED1("aacute", "�")
168   DECLCED1("acirc", "�")
169   { strAcute, CED01(strSingleQuote, "�") },
170   DECLCED1("aelig", "�") /*35*/
171   DECLCED1("agrave", "�")
172   { "amp", "&" },
173   { "apos", strSingleQuote },
174   DECLCED1("aring", "�")
175   DECLCED1("atilde", "�") /*40*/
176   { "auml", CED01("ae", "�") },
177   { "bdquo", strDoubleQuote },
178   DECLCED1("beta", "�")
179   { "boxv", strPipe },
180   { "brkbar", strPipe }, /*45*/
181   { "brvbar", strPipe },
182   { "bull", "o" },
183   DECLCED1("ccedil", "�")
184   { strCedil, CED01(",", "�") },
185   { "cent", CED01("-c-", "�") }, /*50*/
186   { strCirc, "^" },
187   { "copy", CED01("(C)", "�") },
188   { "curren", CED01("CUR", "�") },
189   { "darr", "v" },
190   { "deg", CED01("DEG", "�") }, /*55*/
191   { "divide", CED01("/", "�") },
192   DECLCED1("eacute", "�")
193   DECLCED1("ecirc", "�")
194   DECLCED1("egrave", "�")
195   { "eth", CED01("dh", "�") }, /*60*/
196   DECLCED1("euml", "�")
197   { "euro", "EUR" }, /* inconsequent, but ISO */
198   { "frac12", CED01(" 1/2", "�") },
199   { "frac14", CED01(" 1/4", "�") },
200   { "frac34", CED01(" 3/4", "�") }, /*65*/
201   { "frasl", "/" },
202   { "ge", ">=" },
203   { "gt", strGt },
204   { "hArr", "<=>" },
205   { "harr", "<->" }, /*70*/
206   DECLCED1("iacute", "�")
207   DECLCED1("icirc", "�")
208   { "iexcl", CED01("!", "�") },
209   DECLCED1("igrave", "�")
210   { "iquest", CED01(strQm, "�") }, /*75*/
211   DECLCED1("iuml", "�")
212   { "lArr", "<=" },
213   { "lang", strLt },
214   { "laquo", strDoubleQuote },
215   { "larr", "<-" }, /*80*/
216   { "ldquo", strDoubleQuote },
217   { "le", "<=" },
218   { "lsaquo", strSingleQuote },
219   { "lsquo", strSingleQuote },
220   { "lt", strLt }, /*85*/
221   { "mdash", strMinus },
222   { "middot", CED01(".", "�") },
223   { "minus", strMinus },
224   { "nbsp", strSpace }, /* FIXME! */
225   { "ndash", strMinus }, /*90*/
226   { "ne", "!=" }, /* CHECKME! */
227   { "not", CED01("NOT", "�") }, /* CHECKME! */
228   DECLCED1("ntilde", "�")
229   DECLCED1("oacute", "�")
230   DECLCED1("ocirc", "�") /*95*/
231   DECLCED1("ograve", "�")
232   { "oplus", "+" },
233   { "ordf", CED01("-a", "�") },
234   { "ordm", CED01("-o", "�") },
235   DECLCED1("oslash", "�") /*100*/
236   DECLCED1("otilde", "�")
237   { strOtimes, strAsterisk }, /* CHECKME! */
238   { "ouml", CED01("oe", "�") },
239   { "para", CED01("par.", "�") }, /* CHECKME! */
240   { "permil", "o/oo" }, /*105*/
241   { strPi, strPi },
242   { "plusmn", CED01("+/-", "�") },
243   { "pound", CED01("-L-", "�") },
244   { "prime", strSingleQuote },
245   { "quot", strDoubleQuote }, /*110*/
246   { "rArr", "=>" },
247   { "rang", strGt },
248   { "raquo", strDoubleQuote },
249   { "rarr", "->" },
250   { "rdquo", strDoubleQuote }, /*115*/
251   { "reg", CED01("(R)", "�") },
252   { "rsaquo", strSingleQuote },
253   { "rsquo", strSingleQuote },
254   { "sbquo", strSingleQuote },
255   { "sdot", CED01(strAsterisk, "�") }, /*120*/ /* CHECKME! */
256   { "sect", CED01("sect.", "�") }, /* CHECKME! */
257   { "shy", strEmpty },
258   { "sim", "~" },
259   { "sup1", CED01("^1", "�") },
260   { "sup2", CED01("^2", "�") }, /*125*/
261   { "sup3", CED01("^3", "�") },
262   { "szlig", CED01("ss", "�") },
263   { "thorn", CED01(strP, "�") },
264   { strTilde, "~" },
265   { strTimes, CED01(strAsterisk, "�") }, /*130*/
266   { "trade", "(tm)" },
267   DECLCED1("uacute", "�")
268   { "uarr", "^" },
269   DECLCED1("ucirc", "�")
270   DECLCED1("ugrave", "�") /*135*/
271   { strUml, CED01(strSpace, "�") },
272   { "uuml", CED01("ue", "�") },
273   DECLCED1("yacute", "�")
274   { "yen", CED01("-Y-", "�") },
275   DECLCED1("yuml", "�")
276 };
277 
/* Table of numeric character entities ("&#160;", "&#xA0;"): maps an entity
   code to the chent[] entry which holds the text to show. The table is
   autogenerated - don't edit it by hand. lookup_entity_code() does a binary
   search on the codes, so the entries must stay sorted. CODE01() selects the
   index which fits the compiled-in variant of chent[]; the first index may be
   negative, which means "use chentcode_rerouter[-1 - index] instead" (see
   interpret_character_entities()). The last entry has the highest code and
   is used there as the upper limit for acceptable codes. */
typedef unsigned short tEntityCode;
#define NUM_CHENTCODE (139)
static const struct
{ tEntityCode code; /* (sorted in numerical order) */
  signed short index; /* chent[] or chentcode_rerouter[] index */
} chentcode[NUM_CHENTCODE] =
{ { 34, CODE01(63, 110) },
  { 38, CODE01(8, 37) },
  { 39, CODE01(9, 38) },
  { 60, CODE01(44, 85) },
  { 62, CODE01(31, 68) },
  { 160, CODE01(48, 89) },
  { 161, CODE01(34, 73) },
  { 162, CODE01(17, 50) },
  { 163, CODE01(61, 108) },
  { 164, CODE01(20, 53) },
  { 165, CODE01(88, 139) },
  { 166, CODE01(14, 46) },
  { 167, CODE01(74, 121) },
  { 168, CODE01(86, 136) },
  { 169, CODE01(19, 52) },
  { 170, CODE01(53, 98) },
  { 171, CODE01(38, 79) },
  { 172, CODE01(51, 92) },
  { 173, CODE01(75, 122) },
  { 174, CODE01(69, 116) },
  { 176, CODE01(22, 55) },
  { 177, CODE01(60, 107) },
  { 178, CODE01(78, 125) },
  { 179, CODE01(79, 126) },
  { 180, CODE01(7, 34) },
  { 182, CODE01(57, 104) },
  { 183, CODE01(46, 87) },
  { 184, CODE01(16, 49) },
  { 185, CODE01(77, 124) },
  { 186, CODE01(54, 99) },
  { 187, CODE01(66, 113) },
  { 188, CODE01(27, 64) },
  { 189, CODE01(26, 63) },
  { 190, CODE01(28, 65) },
  { 191, CODE01(35, 75) },
  { 192, CODE01(-1, 3) },
  { 193, CODE01(-1, 1) },
  { 194, CODE01(-1, 2) },
  { 195, CODE01(-1, 5) },
  { 196, CODE01(0, 6) },
  { 197, CODE01(-1, 4) },
  { 198, CODE01(-1, 0) },
  { 199, CODE01(-3, 7) },
  { 200, CODE01(-5, 12) },
  { 201, CODE01(-5, 10) },
  { 202, CODE01(-5, 11) },
  { 203, CODE01(-5, 13) },
  { 204, CODE01(-7, 16) },
  { 205, CODE01(-7, 14) },
  { 206, CODE01(-7, 15) },
  { 207, CODE01(-7, 17) },
  { 208, CODE01(2, 9) },
  { 209, CODE01(-9, 18) },
  { 210, CODE01(-11, 21) },
  { 211, CODE01(-11, 19) },
  { 212, CODE01(-11, 20) },
  { 213, CODE01(-11, 23) },
  { 214, CODE01(3, 24) },
  { 215, CODE01(83, 130) },
  { 216, CODE01(-11, 22) },
  { 217, CODE01(-13, 29) },
  { 218, CODE01(-13, 27) },
  { 219, CODE01(-13, 28) },
  { 220, CODE01(6, 30) },
  { 221, CODE01(-15, 31) },
  { 222, CODE01(5, 26) },
  { 223, CODE01(80, 127) },
  { 224, CODE01(-2, 36) },
  { 225, CODE01(-2, 32) },
  { 226, CODE01(-2, 33) },
  { 227, CODE01(-2, 40) },
  { 228, CODE01(10, 41) },
  { 229, CODE01(-2, 39) },
  { 230, CODE01(-2, 35) },
  { 231, CODE01(-4, 48) },
  { 232, CODE01(-6, 59) },
  { 233, CODE01(-6, 57) },
  { 234, CODE01(-6, 58) },
  { 235, CODE01(-6, 61) },
  { 236, CODE01(-8, 74) },
  { 237, CODE01(-8, 71) },
  { 238, CODE01(-8, 72) },
  { 239, CODE01(-8, 76) },
  { 240, CODE01(24, 60) },
  { 241, CODE01(-10, 93) },
  { 242, CODE01(-12, 96) },
  { 243, CODE01(-12, 94) },
  { 244, CODE01(-12, 95) },
  { 245, CODE01(-12, 101) },
  { 246, CODE01(56, 103) },
  { 247, CODE01(23, 56) },
  { 248, CODE01(-12, 100) },
  { 249, CODE01(-14, 135) },
  { 250, CODE01(-14, 132) },
  { 251, CODE01(-14, 134) },
  { 252, CODE01(87, 137) },
  { 253, CODE01(-16, 138) },
  { 254, CODE01(81, 128) },
  { 255, CODE01(-16, 140) },
  { 710, CODE01(18, 51) },
  { 732, CODE01(82, 129) },
  { 960, CODE01(59, 106) },
  { 8211, CODE01(49, 90) },
  { 8212, CODE01(45, 86) },
  { 8216, CODE01(43, 84) },
  { 8217, CODE01(71, 118) },
  { 8218, CODE01(72, 119) },
  { 8220, CODE01(40, 81) },
  { 8221, CODE01(68, 115) },
  { 8222, CODE01(11, 42) },
  { 8225, CODE01(1, 8) },
  { 8226, CODE01(15, 47) },
  { 8240, CODE01(58, 105) },
  { 8242, CODE01(62, 109) },
  { 8243, CODE01(4, 25) },
  { 8249, CODE01(42, 83) },
  { 8250, CODE01(70, 117) },
  { 8260, CODE01(29, 66) },
  { 8364, CODE01(25, 62) },
  { 8482, CODE01(84, 131) },
  { 8592, CODE01(39, 80) },
  { 8593, CODE01(85, 133) },
  { 8594, CODE01(67, 114) },
  { 8595, CODE01(21, 54) },
  { 8596, CODE01(33, 70) },
  { 8656, CODE01(36, 77) },
  { 8658, CODE01(64, 111) },
  { 8660, CODE01(32, 69) },
  { 8722, CODE01(47, 88) },
  { 8764, CODE01(76, 123) },
  { 8800, CODE01(50, 91) },
  { 8804, CODE01(41, 82) },
  { 8805, CODE01(30, 67) },
  { 8853, CODE01(52, 97) },
  { 8855, CODE01(55, 102) },
  { 8901, CODE01(73, 120) },
  { 9001, CODE01(37, 78) },
  { 9002, CODE01(65, 112) },
  { 9474, CODE01(12, 44) }
};
424 
/* character entity suffix handling: guessed_entity() checks whether an
   unknown entity name ends with one of these suffixes. For the suffixes at
   csiDash ("dash") and csiSpace ("sp") it emits a minus sign resp. a space
   character; for all others it emits the entity name without the suffix. */
enum { csiMax=13, csiDash=9, csiSpace=10 }; /* "csi": chentsuffix[] index */
static const char* const chentsuffix[csiMax + 1] =
{ strAcute, strCedil, strCirc, "grave", "lig", "ring", "slash", strTilde,
  strUml, "dash", "sp", "caron", "comma", "cy"
};
/* strlen() of each chentsuffix[] entry, in the same order; must be kept in
   sync with that table */
static const unsigned char chentsuffixlen[csiMax + 1] =
{ 5, 5, 4, 5, 3, 4, 5, 5, 3, 4, 2, 5, 5, 2 };
433 
/* Table of all known HTML attribute names and the related tAttributeName
   values (autogenerated - don't edit by hand).
   NOTE(review): the code which searches this table isn't in this part of the
   file; the "sorted" remark suggests a binary search - confirm. */
static const struct
{ const char* str; /* (sorted in alphabetical order) */
  tAttributeName an;
} attrdata[] =
{ { "action", anAction },
  { "align", anAlign },
  { strAlt, anAlt },
  { "checked", anChecked },
  { "class", anClass },
  { "color", anColor },
  { "content", anContent },
  { "declare", anDeclare },
  { strDisabled, anDisabled },
  { "enctype", anEnctype },
  { "face", anFace },
  { strHref, anHref },
  { "http-equiv", anHttpEquiv },
  { "id", anId },
  { strLabel, anLabel },
  { "language", anLanguage },
  { "maxlength", anMaxlength },
  { "media", anMedia },
  { "method", anMethod },
  { "multiple", anMultiple },
  { strName, anName },
  { strReadonly, anReadonly },
  { "selected", anSelected },
  { strSize, anSize },
  { "src", anSrc },
  { strStyle, anStyle },
  { strTitle, anTitle },
  { strType, anType },
  { strValue, anValue },
  { "width", anWidth }
};

/* which HTML attributes need value conversion (bitfield); the bit for an
   attribute is tested with my_bit_test(attrvalueconv, <attribute name>) in
   shall_interpret_chents_in_attrvalue() */
static const unsigned char attrvalueconv[5] = { 106, 12, 145, 251, 1 };
472 /* end-autogenerated */
473 
/* States of the HTML parser ("hps"). The values are stored in the small type
   tHtmlParserState; MAX_HPS is the highest state value and must be adjusted
   when a state is added. */
enum
{ hpsText = 0, hpsTag = 1, hpsAttrName = 2, hpsEquals = 3, hpsAttrValue = 4,
  hpsDone = 5, hpsComment1 = 6, hpsComment2 = 7, hpsComment3 = 8
};
typedef unsigned char tHtmlParserState;
#define MAX_HPS (8)

#if CONFIG_DEBUG
/* human-readable state names for debug output, indexed by state value */
static const char* const hps_name[MAX_HPS + 1] =
{ strText, "tag", "attrname", "=", "attrvalue", "done", "comment1", "comment2",
  "comment3"
};
#endif
487 
/* The parser keeps its whole working state in the following file-local
   variables. NOTE(review): most of them are only used by code outside this
   part of the file; the remarks below are limited to what is visible here. */
static tHtmlParserState state;
static tAttributeName current_attr_name; /* attribute being handled */
static tHtmlTagKind current_tagkind; /* tag being handled */
static char* current_unknown_tagname;
static tAttribute* current_attributes; /* list which fada() searches */

/* number of bytes used in <buf> resp. allocated for it; see buf_append() */
static unsigned int bufsize, maxbufsize;

static tBoolean is_endtag, tagblock_ends;
static unsigned char attrvalue_quotes; /* 0=none, 1=single-, 2=double-quotes */

static tCantent* current_cantent;
static tBoolean is_current_node_valid, is_parsing_done, inside_select;
static tHtmlNode *current_node, *current_node_in_tree, *previous_node_in_tree,
  *delayed_node, *select_node;
static const char* dataptr;
static char* buf; /* text collected by buf_append(), e.g. a tag name */
static tContentblock *current_block, *lhpp_content;
static size_t current_block_sizeleft, lhpp_byte;
static tActiveElementBase curraebase, select_aebase, *aebase;
static tActiveElementNumber aenum, aemax;

/* whether <ch> is a space, tab, newline or carriage return; the argument may
   be evaluated up to four times, so it mustn't have side effects */
#define IS_WHITESPACE(ch) \
  ( ((ch) == ' ') || ((ch) == '\t') || ((ch) == '\n') || ((ch) == '\r') )
512 
buf_append(const char ch)513 static void buf_append(const char ch)
514 { if (maxbufsize <= bufsize)
515   { maxbufsize += 1024;
516     buf = memory_reallocate(buf, maxbufsize, mapString);
517   }
518   buf[bufsize++] = ch;
519 }
520 
static tHtmlInputLength attr2htmlinputlength(const tAttribute* attr,
  tHtmlInputLength _default)
/* converts the value of the attribute <attr> to an input length, limited to
   the range from 1 to MAX_HTML_INPUT_LENGTH; returns <_default> if there's no
   attribute or if the attribute has no value */
{ const char* text;
  int number;
  if (attr == NULL) return(_default);
  text = attr->value;
  if (text == NULL) return(_default);
  my_atoi(text, &number, NULL, MAX_HTML_INPUT_LENGTH + 1);
  if (number > MAX_HTML_INPUT_LENGTH) number = MAX_HTML_INPUT_LENGTH;
  else if (number < 1) number = 1;
  return((tHtmlInputLength) number);
}
536 
static one_caller tMbsIndex do_lookup_tagkind(void)
/* searches the tag name which is stored in <buf> in the tagdata[] table
   (binary search, exact strcmp() comparison) and returns the table index.
   The caller treats a negative result as "not found"; that value is produced
   inside the my_binary_search() macro, which is defined elsewhere. */
{ my_binary_search(0, NUM_TAGDATA - 1, strcmp(buf, tagdata[idx].name),
    return(idx))
}
541 
lookup_tagkind(void)542 static one_caller tHtmlTagKind lookup_tagkind(void)
543 /* transforms a tag name string (in <buf>) to the corresponding tag kind
544    number */
545 { tMbsIndex idx = do_lookup_tagkind();
546   if (idx >= 0) return(((tHtmlTagKind) idx) + TAGOFFSET);
547   else return(htkInvalid);
548 }
549 
/* States of interpret_character_entities(): ceisCopy - copying plain text;
   ceisKind - just behind an '&'; ceisNumkind - just behind "&#";
   ceisInside - scanning the body of an entity */
enum { ceisCopy = 0, ceisKind = 1, ceisNumkind = 2, ceisInside = 3 };
typedef unsigned char tCharacterEntityInterpreterState; /* (-: */

/* Kinds of entities: a name ("&amp;"), a decimal number ("&#38;") or a
   hexadecimal number ("&#x26;") */
enum { ekString = 0, ekDecNumber = 1, ekHexNumber = 2 };
typedef unsigned char tEntityKind;
555 
static one_caller tMbsIndex do_lookup_entity_string(const char* str,size_t len)
/* does a binary search in chent[] for an entity name whose first <len>
   characters equal those of <str> (which needn't be NUL-terminated at that
   point). Only a prefix is compared, so the entry which is found might be
   longer than <len>; the caller has to check that. */
{ my_binary_search(0, NUM_CHENT - 1, strncmp(str, chent[idx].name, len),
    return(idx))
}
560 
lookup_entity_string(const char * str,size_t len)561 static one_caller tMbsIndex lookup_entity_string(const char* str, size_t len)
562 { /* Try to find a "candidate": */
563   tMbsIndex retval = do_lookup_entity_string(str, len);
564   /* Check whether the candidate is okay: */
565   if ( (retval >= 0) && (strlen(chent[retval].name) != len) )
566     retval = INVALID_INDEX;
567   return(retval);
568 }
569 
#define cec(code1, code2) my_numcmp(code1, code2) /* compare entity codes */

static one_caller tMbsIndex lookup_entity_code(tEntityCode code)
/* does a binary search for <code> in chentcode[] and returns the index of
   the matching entry; the caller treats a negative result as "not found" */
{ my_binary_search(0, NUM_CHENTCODE - 1, cec(code, chentcode[idx].code),
    return(idx))
}
576 
guessed_entity(const char * str,size_t len,char ** _dest)577 static one_caller tBoolean guessed_entity(const char* str, size_t len,
578   char** _dest)
579 /* tries to "guess" the meaning of an unknown character entity by looking at
580    its prefix or suffix, tries to make the best out of that, and returns
581    whether all that worked */
582 { if (len < 4) goto out; /* can't do anything here */
583   if ( (!strncmp(str, strSup, 3)) && /* entity has prefix "sup" */
584        (! ( (len == 4) && (str[3] == 'e') ) ) ) /* and isn't "&supe;" */
585   { char* dest;
586     str += 3; len -= 3;
587     copy:
588     dest = *_dest;
589     while (len-- > 0) *dest++ = *str++;
590     *_dest = dest;
591     return(truE);
592   }
593   else /* check for suffices */
594   { const char* end = str + len;
595     unsigned short idx;
596     for (idx = 0; idx <= csiMax; idx++)
597     { size_t sufflen = (size_t) chentsuffixlen[idx];
598       if (len > sufflen)
599       { const char* suff = chentsuffix[idx];
600         if (!strncmp(end - sufflen, suff, sufflen))
601         { if (idx == csiDash) { str = strMinus; len = 1; }
602           else if (idx == csiSpace) { str = strSpace; len = 1; }
603           else len -= sufflen;
604           goto copy;
605         }
606       }
607     }
608   }
609   out:
610   return(falsE); /* didn't find anything */
611 }
612 
shall_interpret_chents_in_attrvalue(void)613 static one_caller tBoolean shall_interpret_chents_in_attrvalue(void)
614 { tBoolean retval = cond2boolean( (current_attr_name < NUM_ATTRNAMES) &&
615     (my_bit_test(attrvalueconv, current_attr_name)) );
616   if (retval)
617   { /* Have to handle some special cases. Let's thank the htmlspec writers for
618        this bogosity... */
619     if ( ( (current_tagkind == htkMeta) && (current_attr_name == anName) )
620 #if 0
621       /* These aren't yet implemented. */
622       || ( (current_tagkind == htkLi) && (current_attr_name == anValue) )
623       || ( (current_tagkind == htkSelect) && (current_attr_name == anSize) )
624 #endif
625        )
626       retval = falsE;
627   }
628   return(retval);
629 }
630 
interpret_character_entities(char * origdest,const char * src,tBoolean may_trim)631 static one_caller void interpret_character_entities(char* origdest,
632   const char* src, tBoolean may_trim)
633 { const char *start SHUT_UP_COMPILER(NULL), *start0 SHUT_UP_COMPILER(NULL);
634   char* dest = origdest;
635   tCharacterEntityInterpreterState ceis = ceisCopy;
636   tEntityKind kind SHUT_UP_COMPILER(ekString);
637   unsigned char lenleft SHUT_UP_COMPILER(0);
638   if (may_trim) { while (IS_WHITESPACE(*src)) src++; }
639   while (1)
640   { char ch = *src;
641     unsigned int _code;
642     switch (ceis)
643     {case ceisCopy: /* the most likely case */
644       if (ch == '&') { start0 = src; ceis = ceisKind; }
645       else { *dest++ = ch; if (ch == '\0') goto out; }
646       break;
647      case ceisKind: /* find out the "kind" of the entity */
648       if (ch == '#') ceis = ceisNumkind; /* it's some numeric kind */
649       else /* it's a string */
650       { kind = ekString; start = src; lenleft = MAXLEN_CHENT + 1;
651         ceis = ceisInside; goto inside;
652       }
653       break;
654      case ceisNumkind: /* find out whether it's decimal or hex */
655       lenleft = 4 + 1; ceis = ceisInside;
656       if ( (ch == 'x') || (ch == 'X') ) { kind = ekHexNumber; start = src + 1;}
657       else { kind = ekDecNumber; start = src; goto inside; }
658       break;
659      case ceisInside: /* "inside" the entity */
660       inside:
661       lenleft--;
662       if ( (ch == '\0') || (ch == ';') || (ch == ' ') || (ch == '&') ||
663            (!lenleft) )
664       { /* found an end-point */
665         if (src <= start + 1) /* can't have found anything useful - CHECKME! */
666         { postcopy:
667           while (start0 <= src) *dest++ = *start0++;
668           if (ch == '\0') goto out;
669         }
670         else if (kind == ekString)
671         { size_t len = src - start;
672           tMbsIndex idx = lookup_entity_string(start, len);
673           if (idx >= 0) /* found in list */
674           { const char* temp = chent[idx].result;
675             while (*temp) *dest++ = *temp++;
676           }
677           else if (!guessed_entity(start, len, &dest))
678             goto postcopy; /* no idea */
679         }
680         else if (kind == ekDecNumber)
681         { const char* temp = start;
682           tMbsIndex idx;
683           _code = 0;
684           while (temp < src)
685           { char c = *temp++;
686             if (my_isdigit(c)) _code = 10 * _code + (c - '0');
687             else goto postcopy; /* not a decimal number */
688           }
689           handle_code:
690           if (_code > chentcode[NUM_CHENTCODE - 1].code) goto postcopy;
691           idx = lookup_entity_code((tEntityCode) _code);
692           if (idx < 0)
693           { if ( (_code >= 32) &&
694 #if OPTION_CED == 0
695                  (_code < 127)
696 #else
697                  (_code <= 255) && (_code != 127)
698 #endif
699               )
700             { *dest++ = (char) _code; } /* interpreted as ASCII code */
701             else goto postcopy;
702           }
703           else
704           { const signed short i = chentcode[idx].index;
705 #if OPTION_CED == 0
706             if (i < 0) *dest++ = chentcode_rerouter[-1 - i];
707             else
708 #endif
709             { temp = chent[i].result;
710               while (*temp) *dest++ = *temp++;
711             }
712           }
713         }
714         else if (kind == ekHexNumber)
715         { const char* temp = start;
716           _code = 0;
717           while (temp < src)
718           { char c = *temp++;
719             unsigned int add;
720             if (my_isdigit(c))
721             { add = c - '0';
722               calc:
723               _code = 16 * _code + add;
724             }
725             else if ( (c >= 'a') && (c <= 'f') )
726             { add = c - 'a' + 10; goto calc; }
727             else if ( (c >= 'A') && (c <= 'F') )
728             { add = c - 'A' + 10; goto calc; }
729             else goto postcopy; /* not a hexadecimal number */
730           }
731           goto handle_code;
732         }
733         ceis = ceisCopy;
734       }
735       break;
736     }
737     if (ch == '\0') { *dest = '\0'; goto out; }
738     else src++;
739   }
740   out:
741   if (may_trim) /* remove trailing whitespace */
742   { dest = origdest + strlen(origdest) - 1; /* IMPROVEME? */
743     while (dest >= origdest)
744     { const char c = *dest;
745       if (!IS_WHITESPACE(c)) break;
746       *dest-- = '\0';
747     }
748   }
749 }
750 
find_and_detach_attribute(tAttribute ** list,tAttributeName name)751 static tAttribute* find_and_detach_attribute(tAttribute** list,
752   tAttributeName name)
753 /* searches and extracts an attribute of the given <name> from the <list> */
754 { tAttribute *a = *list, *b;
755   if (a == NULL) return(NULL);
756   if (a->name == name) { *list = a->next; a->next = NULL; return(a); }
757   while ( (b = a->next) != NULL )
758   { if (b->name == name) { a->next = b->next; b->next = NULL; return(b); }
759     a = b;
760   }
761   return(NULL);
762 }
763 
static __my_inline tAttribute* _find_and_detach_attribute(void** list,
  tAttributeName name)
/* same as find_and_detach_attribute(), but for callers which hold the list
   head in a "void*" variable */
{ return(find_and_detach_attribute((tAttribute**) list, name));
    /* nasty casting rubbish */
}

/* shorthands: __fada() detaches the attribute <name> from the list variable
   <list> (which must be an lvalue because its address is taken), fada() does
   that for the attribute list of the current tag */
#define __fada(list, name) find_and_detach_attribute(&list, name)
#define fada(name) __fada(current_attributes, name)
772 
#if CONFIG_JAVASCRIPT
static one_caller tAttribute* fada_js(void** __list)
/* like find_and_detach_attribute(), but for Javascript-related attributes:
   unlinks and returns the first attribute in the list for whose name
   is_an_for_javascript() holds, or returns NULL if there's none */
{ tAttribute** link = (tAttribute**) __list; /* nasty casting rubbish */
  tAttribute* attr;
  while ( (attr = *link) != NULL )
  { if (is_an_for_javascript(attr->name))
    { *link = attr->next; attr->next = NULL; return(attr); }
    link = &(attr->next);
  }
  return(NULL);
}
#endif
794 
/* Helper macros for building <curraebase> from detached attributes. All
   macro arguments which are expressions are put in parentheses, so that the
   macros also work for arguments which are more complex than a plain
   variable name. Note that the arguments are evaluated more than once. */

/* prepare curraebase: clear it completely, then set its kind */
#define set_caeb(_kind) \
  do { my_memclr_var(curraebase); curraebase.kind = (_kind); } while (0)

/* move attribute value: curraebase takes over the value string of the
   attribute, the attribute loses it; <dest> is the name of the curraebase
   member, <attr> must not be NULL */
#define __moav(dest, attr) \
  do { curraebase.dest = (attr)->value; (attr)->value = NULL; } while (0)
#define moavd(attr) __moav(data, attr)
#define moavr(attr) __moav(render, attr)

/* deallocate an attribute and all associated data; deattr() accepts NULL,
   __deattr() doesn't */
#define __deattr(a) \
  do { __dealloc((a)->value); memory_deallocate(a); } while (0)
#define deattr(a) do { if ((a) != NULL) __deattr(a); } while (0)
808 
/* Table of the known values of the "type" attribute of <input> tags and the
   related kinds of active elements. do_lookup_input_type() does a binary
   search on the names, so the entries must stay sorted. */
#define NUM_INPUT_TYPE (10)
static const struct
{ const char* name; /* (sorted in alphabetical order) */
  tActiveElementKind kind;
} input_type[NUM_INPUT_TYPE] =
{ { strButton, aekFormButton },
  { strCheckbox, aekFormCheckbox },
  { strFile, aekFormFile },
  { "hidden", aekFormHidden },
  { strImage, aekFormImage },
  { "password", aekFormPassword },
  { "radio", aekFormRadio },
  { strReset, aekFormReset },
  { strSubmit, aekFormSubmit },
  { strText, aekFormText }
};
825 
static one_caller tMbsIndex do_lookup_input_type(const char* str)
/* does a binary search for <str> in input_type[] and returns the index of
   the matching entry; the caller treats a negative result as "not found".
   NOTE(review): the comparison is done by streqcase3(), which is defined
   elsewhere - presumably a case-insensitive three-way comparison; confirm. */
{ my_binary_search(0, NUM_INPUT_TYPE - 1, streqcase3(str,
    input_type[idx].name), return(idx))
}
830 
lookup_input_type(const tAttribute * attr)831 static one_caller tActiveElementKind lookup_input_type(const tAttribute* attr)
832 { tActiveElementKind retval = aekFormText; /* htmlspec default */
833   if (attr != NULL)
834   { const char* av = attr->value;
835     if ( (av != NULL) && (*av != '\0') ) /* non-empty attribute value */
836     { tMbsIndex idx = do_lookup_input_type(av);
837       if (idx >= 0) retval = input_type[idx].kind;
838       else retval = aekUnknown;
839     }
840   }
841   return(retval);
842 }
843 
#if CONFIG_JAVASCRIPT
static one_caller tMbsIndex lookup_javascript_event(void)
/* checks whether the text in <buf> is the name of a Javascript event handler
   attribute ("on..."): returns INVALID_INDEX if the text is too short or
   doesn't begin with "on" (in any letter case); otherwise does a binary
   search for the rest of the text in strJek[] and returns the index which was
   found.
   NOTE(review): the length is taken as bufsize - 1, so this apparently
   expects that <buf> already contains the terminating NUL character; and the
   strcmp() is case-sensitive, so the caller presumably has lowercased the
   text before - confirm both at the call site. */
{ const char* str;
  size_t len = bufsize - 1;
  if ( (len < JAVASCRIPT_MIN_EVENT_NAME_LENGTH) ||
       (my_tolower(buf[0]) != 'o') || (my_tolower(buf[1]) != 'n') )
    return(INVALID_INDEX);
  str = buf + 2; /* skip the "on" */
  my_binary_search(0, JAVASCRIPT_MAX_EVENT_CODE, strcmp(str, strJek[idx]),
    return(idx))
}
#endif
856 
use_curraebase(void)857 static one_caller tBoolean use_curraebase(void)
858 /* checks whether an active-element base should be created for the current node
859    and prepares that if so */
860 { tBoolean retval = falsE; /* the most likely result */
861   switch (current_tagkind)
862   { case htkA: case htkArea:
863     { tAttribute* h = fada(anHref);
864       if ( (h != NULL) && (h->value != NULL) && (h->value[0] != '\0') )
865       { set_caeb(aekLink); moavd(h); retval = truE; }
866       deattr(h);
867     }
868     break;
869     case htkFrame: case htkIframe:
870     { tAttribute *s = fada(anSrc), *t = fada(anTitle);
871       if ( (s != NULL) && (s->value != NULL) && (s->value[0] != '\0') )
872       { char* tv;
873         set_caeb(aekLink); moavd(s); retval = truE;
874         if ( (t != NULL) && ( (tv = t->value) != NULL ) && (*tv != '\0') )
875           t->value = NULL; /* detach */
876         else tv = my_strdup(_("[a frame]"));
877         curraebase.render = tv;
878       }
879       deattr(s); deattr(t);
880     }
881     break;
882     case htkInput:
883     { tAttribute *t = fada(anType), *n = fada(anName), *v = fada(anValue),
884         *s = fada(anSize), *m = fada(anMaxlength), *a = fada(anAlt),
885         *ch = fada(anChecked), *di = fada(anDisabled), *re = fada(anReadonly);
886       tActiveElementKind kind = lookup_input_type(t);
887       if (kind != aekUnknown)
888       { tActiveElementFlags flags = aefNone;
889         const char* render;
890         set_caeb(kind);
891         if ( (n != NULL) && (n->value != NULL) && (n->value[0] != '\0') )
892           moavd(n);
893         switch (kind)
894         { case aekFormSubmit:
895            render = _("Submit");
896            handle_render:
897            if ( (v != NULL) && (v->value != NULL) && (v->value[0] != '\0') )
898            { render = v->value; v->value = NULL; } /* explicit value given */
899            else render = my_strdup(render);
900            curraebase.render = render; break;
901           case aekFormReset:
902            render = _("Reset"); goto handle_render; /*@notreached@*/ break;
903           case aekFormButton:
904            render = _("[a push button]"); goto handle_render; /*@notreached@*/
905            break;
906           case aekFormImage:
907            if ( (a != NULL) && (a->value != NULL) && (a->value[0] != '\0') )
908            { curraebase.render = a->value; a->value = NULL; }
909            else curraebase.render = my_strdup(_("[a form image]"));
910            break;
911           case aekFormText: case aekFormPassword: case aekFormRadio:
912           case aekFormHidden: /* case aekFormFile: */
913            /* Privacy Note: for aekFormFile, we don't store the default value
914               because storing it might lead to an unwanted transmission of
915               local file contents (if the user submits the form without
916               recognizing that there is an aekFormFile element in it). See e.g.
917               RFC1867, 8. */
918            if ( (v != NULL) && (v->value != NULL) && (v->value[0] != '\0') )
919              moavr(v); /* default text */
920            break;
921         }
922 
923         if (has_input_length(kind))
924         { tHtmlInputLength max = attr2htmlinputlength(m,MAX_HTML_INPUT_LENGTH),
925             size = attr2htmlinputlength(s, 20);
926           if (size > max) size = max;
927           curraebase.size = size; curraebase.maxlength = max;
928         }
929 
930         if (ch != NULL) flags |= aefCheckedSelected;
931         if (di != NULL) flags |= aefDisabled;
932         if (re != NULL) flags |= aefReadonly;
933         curraebase.flags = flags; retval = truE;
934       }
935       deattr(t); deattr(n); deattr(v); deattr(s); deattr(m); deattr(a);
936       deattr(ch); deattr(di); deattr(re);
937     }
938     break;
939     case htkTextarea:
940     { tAttribute *n = fada(anName), *di = fada(anDisabled),
941         *re = fada(anReadonly);
942       tActiveElementFlags flags = aefNone;
943       set_caeb(aekFormText);
944       if ((n != NULL) && (n->value != NULL) && (n->value[0] != '\0')) moavd(n);
945       if (di != NULL) flags |= aefDisabled;
946       if (re != NULL) flags |= aefReadonly;
947       curraebase.flags = flags; curraebase.size = 20; retval = truE;
948       deattr(n); deattr(di); deattr(re);
949     }
950     break;
951     case htkButton:
952     { tAttribute *t = fada(anType), *n = fada(anName), *v = fada(anValue),
953         *di = fada(anDisabled);
954       tActiveElementKind kind = aekFormSubmit; /* htmlspec default */
955       const char* temp;
956       if ((n != NULL) && (n->value != NULL) && (n->value[0] != '\0')) moavd(n);
957       if ( (t != NULL) && ( (temp = t->value) != NULL ) )
958       { if (streqcase(temp, strButton)) kind = aekFormButton;
959         else if (streqcase(temp, strReset)) kind = aekFormReset;
960         else if (!streqcase(temp, strSubmit)) kind = aekUnknown;
961       }
962       if (kind != aekUnknown)
963       { tActiveElementFlags flags = aefButtonTag;
964         if (di != NULL) flags |= aefDisabled;
965         set_caeb(kind);
966         if (v != NULL) moavr(v);
967         curraebase.flags = flags; retval = truE;
968       }
969       deattr(t); deattr(n); deattr(v); deattr(di);
970     }
971     break;
972     case htkSelect:
973     { tAttribute *n = fada(anName), *mu = fada(anMultiple),
974         *di = fada(anDisabled);
975       tActiveElementFlags flags = aefNone;
976       set_caeb(aekFormSelect);
977       if ((n != NULL) && (n->value != NULL) && (n->value[0] != '\0')) moavd(n);
978       if (mu != NULL) flags |= aefMultiple;
979       if (di != NULL) flags |= aefDisabled;
980       curraebase.flags = flags;
981       select_aebase = curraebase; inside_select = truE;
982       /* retval = falsE; -- yes, not "truE" here; this one is special... */
983       deattr(n); deattr(mu); deattr(di);
984     }
985     break;
986   }
987   return(retval);
988 }
989 
990 static my_inline __sallocator tAttribute* __callocator
create_attribute(const tAttributeName name)991   create_attribute(const tAttributeName name)
992 { tAttribute* retval = (tAttribute*) memory_allocate(sizeof(tAttribute),
993     mapOther);
994   retval->name = name;
995   return(retval);
996 }
997 
deallocate_attributes(const tAttribute * a)998 static my_inline void deallocate_attributes(const tAttribute* a)
999 /* deallocates the given attribute list */
1000 { while (a != NULL)
1001   { const tAttribute* next = a->next;
1002     __deattr(a); a = next;
1003   }
1004 }
1005 
deallocate_html_node(const tHtmlNode * node)1006 void deallocate_html_node(const tHtmlNode* node)
1007 { if (node->kind == htkText) __dealloc((const char*) (node->data));
1008   else deallocate_attributes((const tAttribute*) (node->data));
1009   memory_deallocate(node);
1010 }
1011 
deallocate_one_aebase(const tActiveElementBase * aeb)1012 void deallocate_one_aebase(const tActiveElementBase* aeb)
1013 { __dealloc(aeb->data);
1014   if (aeb->kind != aekFormSelect) __dealloc(aeb->render); /* the simple case */
1015   else
1016   { const tHtmlOption* o = (const tHtmlOption*) aeb->render;
1017     while (o != NULL)
1018     { const tHtmlOption* next = o->next;
1019       __dealloc(o->value); __dealloc(o->render); memory_deallocate(o);
1020       o = next;
1021     }
1022   }
1023 #if CONFIG_JAVASCRIPT
1024   javascript_remove_ehs(aeb->eh);
1025 #endif
1026 }
1027 
htk_has_flag(const tHtmlTagKind kind,const tHtmlTagFlags flag)1028 static my_inline tBoolean htk_has_flag(const tHtmlTagKind kind,
1029   const tHtmlTagFlags flag)
1030 { return(cond2boolean( (kind >= TAGOFFSET) && (kind < TAGOFFSET + NUM_TAGDATA)
1031     && (tagdata[kind - TAGOFFSET].flags & flag) ));
1032 }
1033 
htk_soaks_up_text(const tHtmlTagKind kind)1034 static __my_inline tBoolean htk_soaks_up_text(const tHtmlTagKind kind)
1035 { return(htk_has_flag(kind, htfSoakUpText));
1036 }
1037 
htk_forbids_endtag(const tHtmlTagKind kind)1038 __my_inline tBoolean htk_forbids_endtag(const tHtmlTagKind kind)
1039 { return(htk_has_flag(kind, htfForbidEndtag));
1040 }
1041 
htk_forbids_pre(const tHtmlTagKind kind)1042 __my_inline tBoolean htk_forbids_pre(const tHtmlTagKind kind)
1043 { return(htk_has_flag(kind, htfForbidPre));
1044 }
1045 
htk_is_block(const tHtmlTagKind kind)1046 __my_inline tBoolean htk_is_block(const tHtmlTagKind kind)
1047 { return(htk_has_flag(kind, htfBlock));
1048 }
1049 
htk_is_par(const tHtmlTagKind kind)1050 __my_inline tBoolean htk_is_par(const tHtmlTagKind kind)
1051 { return(htk_has_flag(kind, htfPar));
1052 }
1053 
create_html_form(const char * action,tHtmlFormFlags flags)1054 static one_caller void create_html_form(const char* action,
1055   tHtmlFormFlags flags)
1056 { tHtmlFormNumber num = current_cantent->hfnum, max = current_cantent->hfmax;
1057   tHtmlForm* f;
1058   if (num >= max)
1059   { max += ( (max >= 9) ? 10 : 3 ); current_cantent->hfmax = max;
1060     current_cantent->form = memory_reallocate(current_cantent->form,
1061       max * sizeof(tHtmlForm), mapOther);
1062   }
1063   f = &(current_cantent->form[num]); f->action_uri = action; f->flags = flags;
1064   f->first_ae = f->last_ae = INVALID_AE; current_cantent->hfnum = num + 1;
1065 #if CONFIG_DEBUG
1066   sprint_safe(debugstrbuf,
1067     "create_html_form(): num=%d, max=%d, action=*%s*, flags=%d\n",
1068     num, max, action, flags);
1069   debugmsg(debugstrbuf);
1070 #endif
1071 }
1072 
append_attribute_name(const tAttributeName name)1073 static void append_attribute_name(const tAttributeName name)
1074 /* appends an attribute of the given <name> to current_attributes (avoiding
1075    duplicates) */
1076 { tAttribute* a = fada(name);
1077   if (a != NULL) dealloc(a->value); /* "forget" old value */
1078   else a = create_attribute(name);
1079   a->next = current_attributes;
1080   current_attributes = a;
1081 }
1082 
set_current_node(tHtmlNode * node)1083 static __my_inline void set_current_node(tHtmlNode* node)
1084 { current_node = node;
1085   is_current_node_valid = truE;
1086 }
1087 
static void store_html_node(tHtmlNode* node, tBoolean do_skip_char)
/* stores the <node> in the tree, updates the lhpp_.... information and creates
   an active-element base if appropriate (that is, if the node is flagged
   hnfHasAeBase, curraebase is appended to the aebase array); finally the node
   becomes the current node. While a <select> is open, the node is thrown away
   instead. */
{ const tHtmlTagKind kind = node->kind;

  if (inside_select) /* don't store anything */
  { deallocate_html_node(node);
    return;
  }

  /* Link the node to the end of the tree; if there's no predecessor, the node
     is the first one for the tree of the current resource. */
  if (previous_node_in_tree == NULL) current_cantent->tree = node;
  else previous_node_in_tree->next = node;
  previous_node_in_tree = node;
  node->flags |= hnfStoredInTree;

  /* Note the position up to which the content is parsed now. */
  lhpp_content = current_block;
  if (current_block != NULL)
  { lhpp_byte = current_block->used - current_block_sizeleft +
      boolean2bool(do_skip_char);
  }
  else
  { /* This can e.g. happen if an HTML document ends with an opening <title>
       tag (incomplete document or just not yet completely received); in this
       case we have the call chain "parser_html_next() -> change_state(hpsDone)
       -> finish_delayed_node() -> store_html_node()". */
    lhpp_byte = 0;
  }

  if (kind == htkTitle)
  { /* a non-empty title text replaces the title known so far */
    tAttribute* title_text =
      _find_and_detach_attribute(&(node->data), anInternalText);
    if ( (title_text != NULL) && (title_text->value != NULL) &&
         (title_text->value[0] != '\0') )
    { __dealloc(current_cantent->major_html_title);
      current_cantent->major_html_title = title_text->value;
      title_text->value = NULL; /* detached */
    }
    deattr(title_text);
  }

  if (node->flags & hnfHasAeBase)
  { if (aenum >= aemax) /* need to allocate more memory */
    { aemax += aenum_incvalue(aemax);
      aebase = memory_reallocate(aebase, aemax * sizeof(tActiveElementBase),
        mapOther);
      current_cantent->aebase = aebase;
    }
#if CONFIG_JAVASCRIPT
    { /* extract event handlers: every handler attribute which compiles is
         put in front of the handler list */
      const tJavascriptEventHandler* handlers = NULL;
      const tAttribute* js_attr;
      while ( (js_attr = fada_js(&(node->data))) != NULL )
      { const char* source = js_attr->value;
        const tJavascriptCode* compiled;
        if ( (source != NULL) && (*source != '\0') &&
             ( (compiled = javascript_compile(source)) != NULL ) )
        { tJavascriptEventHandler* handler =
            javascript_create_eh(js_attr->name - anJavascriptBegin, compiled);
          handler->next = handlers; handlers = handler;
        }
      }
      curraebase.eh = handlers;
    }
#endif
    if (kind == htkTextarea)
    { /* the soaked-up text is handed over to the element base */
      tAttribute* area_text =
        _find_and_detach_attribute(&(node->data), anInternalText);
      if ( (area_text != NULL) && (area_text->value != NULL) &&
           (area_text->value[0] != '\0') )
        moavr(area_text);
      deattr(area_text);
    }
    aebase[aenum++] = curraebase;
  }

  set_current_node(node);
#if CONFIG_EXDEBUG
  sprint_safe(debugstrbuf, "stored node %p, %d, %d\n", node, kind,
    node->flags);
  prsdbg(debugstrbuf);
#endif
}
1162 
prepare_text(tBoolean may_trim)1163 static __sallocator char* __callocator prepare_text(tBoolean may_trim)
1164 { char* text = (char*) __memory_allocate(bufsize, mapString);
1165   interpret_character_entities(text, buf, may_trim);
1166   return(text);
1167 }
1168 
finish_delayed_node(tBoolean got_text,tBoolean do_skip_char)1169 static one_caller void finish_delayed_node(tBoolean got_text,
1170   tBoolean do_skip_char)
1171 { tHtmlTagKind htk = delayed_node->kind;
1172   if ( (got_text) && (htk != htkScript) && (htk != htkStyle) )
1173   { /* add the text as an internal attribute */
1174     tAttribute* a = create_attribute(anInternalText);
1175     tBoolean may_trim = cond2boolean((htk == htkOption) || (htk == htkTitle));
1176     a->value = prepare_text(may_trim); a->next = delayed_node->data;
1177     delayed_node->data = a;
1178   }
1179   if ( (inside_select) && (htk == htkOption) )
1180   { /* add this option to the current <select> data */
1181     tAttribute *list = (tAttribute*) delayed_node->data,
1182       *v = __fada(list, anValue), *l = __fada(list, anLabel),
1183       *t = __fada(list, anInternalText),
1184       *se = __fada(list, anSelected), *di = __fada(list, anDisabled);
1185     tHtmlOption *option = __memory_allocate(sizeof(tHtmlOption), mapOther),
1186       *o = (tHtmlOption*) select_aebase.render;
1187     tHtmlOptionFlags hof = hofNone;
1188     char *value, *render;
1189 
1190     delayed_node->data = (void*) list; deallocate_html_node(delayed_node);
1191 
1192     /* What to render? */
1193     if ( (l != NULL) && (l->value != NULL) && (l->value[0] != '\0') )
1194     { render = l->value; l->value = NULL; }
1195     else if ( (t != NULL) && (t->value != NULL) && (t->value[0] != '\0') )
1196     { render = my_strdup(t->value); }
1197     else render = my_strdup(_("[an option]"));
1198 
1199     /* What to submit? */
1200     if ( (v != NULL) && (v->value != NULL) && (v->value[0] != '\0') )
1201     { value = v->value; v->value = NULL; }
1202     else if ( (t != NULL) && (t->value != NULL) && (t->value[0] != '\0') )
1203     { value = t->value; t->value = NULL; }
1204     else value = NULL;
1205 
1206     if (se != NULL) hof |= hofSelected;
1207     if (di != NULL) hof |= hofDisabled;
1208     option->next = NULL; option->value = value; option->render = render;
1209     option->flags = hof;
1210     if (o == NULL) select_aebase.render = (char*) option;
1211     else
1212     { while (o->next != NULL) o = o->next; /* IMPROVEME? */
1213       o->next = option;
1214     }
1215     select_aebase.maxlength++; /* option counter */
1216     deattr(v); deattr(l); deattr(t); deattr(se); deattr(di);
1217   }
1218   else if (!is_parsing_done) store_html_node(delayed_node, do_skip_char);
1219   else deallocate_html_node(delayed_node); /* just forget it */
1220   delayed_node = NULL; /* done with this one */
1221 }
1222 
deallocate_current_attributes(void)1223 static my_inline void deallocate_current_attributes(void)
1224 { if (current_attributes != NULL)
1225   { deallocate_attributes(current_attributes); current_attributes = NULL; }
1226 }
1227 
handle_meta_tag(void)1228 static one_caller void handle_meta_tag(void)
1229 { const tAttribute *n1 = fada(anName), *n2 = fada(anHttpEquiv), *n,
1230     *c = fada(anContent);
1231   const char* cv;
1232   if ( (c == NULL) || ( (cv = c->value) == NULL ) ) goto out; /* no content */
1233   n = ( (n1 != NULL) ? n1 : n2 );
1234   if ( (n == NULL) || (n->value == NULL) ) goto out; /* no name */
1235   if (!streqcase(n->value, "refresh")) goto out; /* unknown name */
1236 
1237   if (current_cantent->redirection != NULL)
1238   { /* don't override HTTP redirection - the dynamic server usually knows
1239        better than the static HTML document does */
1240     goto out;
1241   }
1242   /* The refresh content isn't "really" standardized, so let's be careful: */
1243   while (IS_WHITESPACE(*cv)) cv++;
1244   if (my_isdigit(*cv))
1245   { /* We skip and ignore the number of seconds. It is often 0 anyway, and if
1246        it _isn't_ 0 it often _should_ be 0 in order not to waste users' time,
1247        and users can get back to the original document easily with a single
1248        keyboard command at any later time. So we redirect immediately... */
1249     int dummy;
1250     my_atoi(cv, &dummy, &cv, 9);
1251     while (IS_WHITESPACE(*cv)) cv++;
1252   }
1253   if ( (*cv == ',') || (*cv == ';') ) cv++;
1254   while (IS_WHITESPACE(*cv)) cv++;
1255   if (strneqcase(cv, "url=", 4)) { cv += 4; while (IS_WHITESPACE(*cv)) cv++; }
1256   if (*cv != '\0')
1257   { current_cantent->redirection = my_strdup(cv);
1258     current_cantent->caf |= cafHtmlRedirection;
1259 #if CONFIG_DEBUG
1260     { char* spfbuf;
1261       my_spf(debugstrbuf, STRBUF_SIZE, &spfbuf, "HTML redirection: *%s*\n",cv);
1262       debugmsg(spfbuf); my_spf_cleanup(debugstrbuf, spfbuf);
1263     }
1264 #endif
1265   }
1266   out:
1267   deattr(n1); deattr(n2); deattr(c);
1268 }
1269 
build_html_node(void)1270 static void build_html_node(void)
1271 { tHtmlNode* node = memory_allocate(sizeof(tHtmlNode), mapHtmlNode);
1272   const tBoolean is_textual = cond2boolean(current_tagkind == htkText),
1273     ucb = ( (is_endtag || is_textual || is_parsing_done) ? falsE
1274       : use_curraebase() );
1275   tHtmlNodeFlags hnf = hnfNone;
1276   if (is_endtag) hnf |= hnfIsEndtag;
1277   if (tagblock_ends) hnf |= hnfTagblockEnds;
1278 
1279   /* Build the node: */
1280   if (is_textual) node->data = prepare_text(falsE);
1281   else
1282   { if (current_tagkind == htkInvalid)
1283     { append_attribute_name(anInternalTagname);
1284       current_attributes->value = current_unknown_tagname;
1285         /* CHECKME: that's unclean! */
1286       current_unknown_tagname = NULL;
1287     }
1288     else if ( (!is_endtag) && (!is_parsing_done) )
1289     { if (current_tagkind == htkCenter) hnf |= hnfAlignCenter;
1290       else if (current_tagkind == htkMeta) handle_meta_tag();
1291       else if (current_tagkind == htkForm)
1292       { tAttribute *a = fada(anAction), *m = fada(anMethod),
1293           *e = fada(anEnctype);
1294         if ( (a != NULL) && (a->value != NULL) && (a->value[0] != '\0') )
1295         { /* seems we have an "action" attribute, let's create a form */
1296           tHtmlFormFlags ff = hffNone;
1297           hnf |= hnfGoodForm;
1298           if ((m != NULL) && (m->value != NULL) && streqcase(m->value, "post"))
1299             ff |= hffMethodPost;
1300           if ( (e != NULL) && (e->value != NULL) &&
1301                (streqcase(e->value, "multipart/form-data")) )
1302             ff |= hffEncodingMultipart;
1303           create_html_form(a->value, ff);
1304           a->value = NULL;
1305         }
1306         deattr(a); deattr(m); deattr(e);
1307       }
1308     }
1309 
1310     /* convert some general attributes to a more efficient representation */
1311     { const tAttribute* a = fada(anAlign);
1312       const char* av;
1313       if ( (a == NULL) /* most likely */ || (current_tagkind == htkTable) ||
1314            (current_tagkind == htkTh) || (current_tagkind == htkTd) )
1315         goto dont_align; /* CHECKME! */
1316       if ( ( (av = a->value) != NULL ) && (*av != '\0') )
1317       { if (streqcase(av, "left")) hnf |= hnfAlignLeft;
1318         else if (streqcase(av, strCenter)) hnf |= hnfAlignCenter;
1319         else if (streqcase(av, "right")) hnf |= hnfAlignRight;
1320       }
1321       dont_align: {}
1322       deattr(a);
1323     }
1324     node->data = current_attributes; current_attributes = NULL; /* detach */
1325   }
1326   node->kind = current_tagkind; node->flags = hnf; node->next = NULL;
1327 
1328   /* Now decide what to do with the node: */
1329   if ( (!is_endtag) && (htk_soaks_up_text(current_tagkind)) )
1330   { if (current_tagkind == htkTextarea) node->flags |= hnfHasAeBase;
1331     delayed_node = node; /* We wanna soak up any immediately following text. */
1332   }
1333   else if (current_tagkind == htkSelect)
1334   { if (!is_endtag) select_node = node;
1335     else if (inside_select) /* finish and store the <select> info */
1336     { tHtmlOption *o, *o0;
1337       tBoolean found_selected_option = falsE,
1338         is_multiple = cond2boolean(select_aebase.flags & aefMultiple);
1339       inside_select = falsE; curraebase = select_aebase;
1340       select_node->flags |= hnfHasAeBase;
1341       /* Make sure one option is selected (and _only_ one if non-multiple) */
1342       o = o0 = (tHtmlOption*) curraebase.render;
1343       while (o != NULL)
1344       { if (o->flags & hofSelected)
1345         { if (found_selected_option)
1346           { /* more than one option selected in a non-multiple <select> */
1347             o->flags &= ~hofSelected;
1348           }
1349           else
1350           { found_selected_option = truE;
1351             if (is_multiple) break; /* don't care about further selections */
1352           }
1353         }
1354         o = o->next;
1355       }
1356       if ( (!found_selected_option) && (o0 != NULL) ) o0->flags |= hofSelected;
1357       /* Store the <select> node */
1358       store_html_node(select_node, truE); select_node = NULL;
1359     }
1360   }
1361   else if (!is_parsing_done) /* the most likely case */
1362   { tBoolean do_skip_char = cond2boolean(!is_textual);
1363     if (ucb) node->flags |= hnfHasAeBase;
1364     store_html_node(node, do_skip_char);
1365   }
1366   else if (is_textual)
1367   { /* Just imagine the HTML document currently ends with a longish text - the
1368        user should see it while receiving. Or imagine that a web page author
1369        simply forgot something like "</body></html>" after the last text run in
1370        a document... */
1371     set_current_node(node);
1372   }
1373   else deallocate_html_node(node); /* just forget it */
1374   deallocate_current_attributes();
1375 }
1376 
do_lookup_attrname(void)1377 static one_caller tMbsIndex do_lookup_attrname(void)
1378 { my_binary_search(0, ARRAY_ELEMNUM(attrdata) - 1,
1379     strcmp(buf, attrdata[idx].str), return(idx))
1380 }
1381 
lookup_attrname(void)1382 static one_caller tAttributeName lookup_attrname(void)
1383 { tMbsIndex idx = do_lookup_attrname();
1384   return( (idx < 0) ? anUnknown : attrdata[idx].an );
1385 }
1386 
1387 #define aai(cond) if (cond) current_attr_name = an /* "accept <an> if" */
1388 
change_state(tHtmlParserState new_state)1389 static void change_state(tHtmlParserState new_state)
1390 { tAttributeName an;
1391   buf_append('\0'); /* (for simplicity) */
1392   if (new_state == hpsDone) is_parsing_done = truE;
1393 
1394   switch (state)
1395   {case hpsText: /* reached the end of a text run */
1396     if (delayed_node != NULL)
1397       finish_delayed_node(cond2boolean(bufsize > 1), falsE);
1398     else if (bufsize > 1) /* got some text for a "normal" text node */
1399     { current_tagkind = htkText; build_html_node(); }
1400     break;
1401    case hpsTag:
1402     /* We store only tags we can make sense of. (I.e. we'll store htkInvalid
1403        tags only if they have some of the "general" attributes.) */
1404     current_tagkind = htkInvalid; /* default */
1405     if (bufsize > 1)
1406     { if (!strcmp(buf, strCommentTag)) new_state = hpsComment1;
1407       else
1408       { current_tagkind = lookup_tagkind(); __dealloc(current_unknown_tagname);
1409         current_unknown_tagname = ( (current_tagkind != htkInvalid) ? NULL :
1410           my_strdup(buf) );
1411       }
1412     }
1413     break;
1414    case hpsAttrName:
1415     /* We store only attributes we can make sense of. */
1416     current_attr_name = anUnknown; /* default */
1417     if (is_endtag) goto an_ignore; /* no attributes allowed */
1418     an = lookup_attrname();
1419     if (an == anUnknown) goto an_ignore;
1420 
1421     /* general attributes (allowed for almost all tags) */
1422     aai( (an == anAlign) || (an == anId) );
1423 #if CONFIG_CSS
1424     else aai( (an == anClass) || (an == anStyle) );
1425 #endif
1426 #if CONFIG_JAVASCRIPT
1427     else
1428     { tMbsIndex idx = lookup_javascript_event();
1429       if (idx >= 0)
1430         current_attr_name = ((tAttributeName) idx) + anJavascriptBegin;
1431     }
1432 #endif
1433     if (current_attr_name != anUnknown) goto an_append;
1434 
1435     /* specific attributes */
1436     if (current_tagkind == htkInvalid) goto an_ignore;
1437     switch (current_tagkind)
1438     {case htkA: aai( (an == anHref) || (an == anName) ); break;
1439      case htkArea: aai( (an == anHref) || (an == anAlt) ); break;
1440      case htkButton:
1441       aai( (an == anType) || (an == anName) || (an == anValue) ||
1442            (an == anDisabled) );
1443       break;
1444      case htkForm:
1445       aai( (an == anAction) || (an == anMethod) || (an == anEnctype) ); break;
1446      case htkFrame: case htkIframe:
1447       aai( (an == anSrc) || (an == anTitle) ); break;
1448      case htkHr: aai(an == anWidth); break;
1449      case htkImg: aai( (an == anAlt) || (an == anSrc) ); break;
1450      case htkInput:
1451       aai( (an == anType) || (an == anName) || (an == anValue) || (an == anAlt)
1452            || (an == anSize) || (an == anMaxlength) || (an == anChecked) ||
1453            (an == anDisabled) || (an == anReadonly) );
1454       break;
1455      case htkObject: aai( (an == anType) || (an == anDeclare) ); break;
1456      case htkOptgroup: aai( (an == anLabel) || (an == anDisabled) ); break;
1457      case htkOption:
1458       aai( (an == anValue) || (an == anLabel) || (an == anSelected) ||
1459            (an == anDisabled) );
1460       break;
1461      case htkSelect:
1462       aai( (an == anName) || (an == anMultiple) || (an == anDisabled) ); break;
1463      case htkTextarea:
1464       aai( (an == anName) || (an == anDisabled) || (an == anReadonly) ); break;
1465      case htkMeta:
1466       aai( (an == anName) || (an == anHttpEquiv) || (an == anContent) ); break;
1467      case htkFont:
1468       aai(an == anColor);
1469 #if TGC_IS_GRAPHICS
1470       else aai( (an == anSize) || (an == anFace) );
1471 #endif
1472       break;
1473 #if CONFIG_CSS
1474      case htkStyle: aai(an == anMedia); break;
1475 #endif
1476     }
1477 
1478     if (current_attr_name != anUnknown)
1479     { an_append: append_attribute_name(current_attr_name); }
1480     an_ignore: {}
1481     break;
1482    case hpsAttrValue:
1483     if ( (bufsize > 1) && (current_attr_name != anUnknown) &&
1484          (current_attributes != NULL) &&
1485          (current_attributes->name == current_attr_name) )
1486     { /* (The two latter tests "should" be unnecessary.) */
1487       current_attributes->value = (shall_interpret_chents_in_attrvalue() ?
1488         prepare_text(truE) : my_strdup(buf));
1489     }
1490     break;
1491   }
1492 
1493 #if CONFIG_DEBUG
1494   if (! ( ( (state == hpsText) || (state == hpsAttrName) ) && (bufsize < 2) ) )
1495   { char* spfbuf;
1496     my_spf(debugstrbuf, STRBUF_SIZE, &spfbuf, "%s: *%s%s*\n", hps_name[state],
1497       ( ( (state == hpsTag) && (is_endtag) ) ? (strSlash) : (strEmpty) ),
1498       ( (bufsize > 1) ? (buf) : ("(nothing)") ));
1499     prsdbg(spfbuf); my_spf_cleanup(debugstrbuf, spfbuf);
1500   }
1501 #endif
1502 
1503   state = new_state; bufsize = 0;
1504   if (state == hpsAttrValue) attrvalue_quotes = 0;
1505   else if (state == hpsText) /* reached the end of an HTML tag */
1506   { if ( (current_tagkind != htkInvalid) || (current_attributes != NULL) )
1507     { /* reached the end of 1. a _known_ HTML tag or 2. an htkInvalid tag with
1508          "general" attributes */
1509       build_html_node();
1510     }
1511   }
1512 }
1513 
1514 #undef aai
1515 
void parser_html_start(tCantent* cantent)
/* prepares the parser for parsing the <cantent>: nodes which already exist in
   the tree of the content will be delivered first; the actual parser is set
   up to continue at the position up to which the content was parsed earlier
   (lhpp_content/lhpp_byte), and all per-run parser state is reset */
{ size_t parsedsize;

  current_cantent = cantent;
  current_node_in_tree = (tHtmlNode*) cantent->tree;
  current_node = delayed_node = select_node = previous_node_in_tree = NULL;

  /* If we haven't yet parsed the whole content, we might reach a point where
     we actually have to parse something, so we setup the parser here: */
  current_block = lhpp_content = cantent->lhpp_content;
  parsedsize = lhpp_byte = cantent->lhpp_byte;
  if (current_block != NULL)
  { const size_t usedsize = current_block->used;
    dataptr = current_block->data + parsedsize;
    /* (never let the remaining size wrap around) */
    if (parsedsize >= usedsize) current_block_sizeleft = 0;
    else current_block_sizeleft = usedsize - parsedsize;
  }
  else
  { dataptr = NULL; current_block_sizeleft = 0; }
  aebase = cantent->aebase; aenum = cantent->aenum; aemax = cantent->aemax;

  state = hpsText;
  buf = current_unknown_tagname = NULL;
  maxbufsize = bufsize = 0;

  current_tagkind = htkInvalid; current_attr_name = anUnknown;
  current_attributes = NULL;
  is_current_node_valid = is_parsing_done = inside_select = falsE;

#if CONFIG_DEBUG
  /* The sizes are cast because "%d" asks for an int; parsedsize is a size_t,
     which must not be passed as is for that conversion. */
  sprint_safe(debugstrbuf, "\nparser_html_start(): %p,%p,%p,%d,%d\n",
    cantent, current_block, dataptr, (int) parsedsize,
    (int) current_block_sizeleft);
  prsdbg(debugstrbuf);
#endif
}
1554 
parser_html_next(tBoolean inside_pre)1555 const tHtmlNode* parser_html_next(tBoolean inside_pre)
1556 /* returns the next node (or NULL) for the currently parsed resource */
1557 { tBoolean found_whitespace = falsE;
1558   char ch;
1559   if (current_node_in_tree != NULL)
1560   { /* need not actually parse something, found a node inside the tree
1561        (generated during earlier passes) */
1562     tHtmlNode* retval = previous_node_in_tree = current_node_in_tree;
1563     current_node_in_tree = current_node_in_tree->next;
1564     return(retval);
1565   }
1566 
1567   /* Reached the end of the already generated tree, have to parse: */
1568   loop:
1569   if (is_current_node_valid)
1570   { if (is_parsing_done)
1571     { tHtmlNode* retval = current_node; current_node = NULL; return(retval); }
1572     else { is_current_node_valid = falsE; return(current_node); }
1573   }
1574   if (current_block_sizeleft <= 0)
1575   { if (current_block != NULL) current_block = current_block->next;
1576     if (current_block != NULL)
1577     { dataptr = current_block->data;
1578       current_block_sizeleft = current_block->used;
1579     }
1580     else /* reached the end of the last content block in the list */
1581     { change_state(hpsDone);
1582       if (!is_current_node_valid)
1583       { current_node = NULL; is_current_node_valid = truE; } /* (extra care) */
1584     }
1585     goto loop;
1586   }
1587   ch = *dataptr++;
1588   switch (state)
1589   {case hpsText:
1590     if (ch == '<')
1591     { if (found_whitespace) { buf_append(' '); found_whitespace = falsE; }
1592       change_state(hpsTag); is_endtag = falsE;
1593     }
1594     else if (!inside_pre)
1595     { if (IS_WHITESPACE(ch)) found_whitespace = truE;
1596       else
1597       { if (found_whitespace) { buf_append(' '); found_whitespace = falsE; }
1598         buf_append(ch);
1599       }
1600     }
1601     else buf_append(ch);
1602     break;
1603    case hpsTag:
1604     if (IS_WHITESPACE(ch))
1605     { if (bufsize > 0) { change_state(hpsAttrName); tagblock_ends = falsE; } }
1606     else if (ch == '>') change_state(hpsText);
1607     else if (ch == '/')
1608     { if (bufsize == 0) is_endtag = truE;
1609       else { change_state(hpsAttrName); tagblock_ends = truE; }
1610     }
1611     else
1612     { buf_append(my_tolower(ch)); /* case-insensitivity: htmlspec, 3.2.1 */
1613       if ( (bufsize == 3) && (!strncmp(buf, strCommentTag, 3)) )
1614         change_state(hpsComment1);
1615     }
1616     break;
1617    case hpsAttrName:
1618     if (IS_WHITESPACE(ch)) { if (bufsize > 0) change_state(hpsEquals); }
1619     else if (ch == '=') change_state(hpsAttrValue);
1620     else if (ch == '>') change_state(hpsText);
1621     else if (ch == '/') tagblock_ends = truE; /* CHECKME: chg..(hpsAttrName)?*/
1622     else
1623     { attr_name_append:
1624       buf_append(my_tolower(ch)); /* case-insensitivity: htmlspec, 3.2.2 */
1625     }
1626     break;
1627    case hpsEquals:
1628     if (IS_WHITESPACE(ch)) { /* nothing */ }
1629     else if (ch == '=') change_state(hpsAttrValue);
1630     else if (ch == '>') change_state(hpsText);
1631     else { change_state(hpsAttrName); goto attr_name_append; } /* no value */
1632     break;
1633    case hpsAttrValue:
1634     if (IS_WHITESPACE(ch))
1635     { if (attrvalue_quotes != 0) buf_append(ch);
1636       else if (bufsize > 0) change_state(hpsAttrName);
1637     }
1638     else if (ch == '"')
1639     { if ( (attrvalue_quotes == 0) && (bufsize == 0) ) attrvalue_quotes = 2;
1640       else if (attrvalue_quotes == 2) change_state(hpsAttrName); /* value end*/
1641       else buf_append(ch);
1642     }
1643     else if (ch == '\'')
1644     { if ( (attrvalue_quotes == 0) && (bufsize == 0) ) attrvalue_quotes = 1;
1645       else if (attrvalue_quotes == 1) change_state(hpsAttrName); /* value end*/
1646       else buf_append(ch);
1647     }
1648     else if ( (ch == '>') && (attrvalue_quotes == 0) ) change_state(hpsText);
1649     else buf_append(ch);
1650     break;
1651    /* For most of the hpsComment states, we need not call change_state()
1652       because these are rather some kind of "sub-states": */
1653    case hpsComment1:
1654     if (ch == '-') state = hpsComment2;
1655     break;
1656    case hpsComment2:
1657     if (ch == '-') state = hpsComment3;
1658     else state = hpsComment1;
1659     break;
1660    case hpsComment3:
1661     if (ch == '>') change_state(hpsText);
1662     else if ( (!IS_WHITESPACE(ch)) && (ch != '-') ) state = hpsComment1;
1663     /* "else": stick to hpsComment3! htmlspec, 3.2.4: "White space is not per-
1664        mitted between the markup declaration open delimiter("<!") and the com-
1665        ment open delimiter ("--"), but is permitted between the comment close
1666        delimiter ("--") and the markup declaration close delimiter (">")."
1667        Additionally, we leniently allow extra "-" characters because a web page
1668        author might accidentally write e.g. "--->" instead of "-->"... */
1669     break;
1670   }
1671   current_block_sizeleft--;
1672   goto loop;
1673 }
1674 
parser_html_finish(void)1675 void parser_html_finish(void)
1676 /* finishes the parsing of the current cantent */
1677 { if (inside_select)
1678   { deallocate_one_aebase(&select_aebase);
1679     if (select_node != NULL) deallocate_html_node(select_node);
1680   }
1681   if (delayed_node != NULL) deallocate_html_node(delayed_node);
1682   deallocate_attributes(current_attributes);
1683   current_cantent->lhpp_content = lhpp_content;
1684   current_cantent->lhpp_byte = lhpp_byte;
1685   current_cantent->aebase = aebase;
1686   current_cantent->aenum = aenum; current_cantent->aemax = aemax;
1687   __dealloc(buf);
1688   i18n_cleanup /* FIXME on interface change: only do this if parser usedepth is
1689     zero! */
1690 }
1691 
parser_initialize(void)1692 one_caller void __init parser_initialize(void)
1693 {
1694 #if CONFIG_DEBUG
1695   static const char headline[] = "retawq " RETAWQ_VERSION
1696     " HTML parser debugging file (<http://retawq.sourceforge.net/>)\n";
1697   fd_parsertest = my_create("htmldebug.txt", O_CREAT | O_TRUNC | O_WRONLY,
1698     S_IRUSR | S_IWUSR);
1699   if (fd_parsertest < 0)
1700     fatal_error(errno, "can't create HTML parser debugging file");
1701   make_fd_cloexec(fd_parsertest);
1702   prsdbg(headline);
1703 #endif
1704 }
1705