1 /* retawq/parser.c - content parsing (HTML, ...)
2 This file is part of retawq (<http://retawq.sourceforge.net/>), a network
3 client created by Arne Thomassen; retawq is basically released under certain
4 versions of the GNU General Public License and WITHOUT ANY WARRANTY.
5 Read the file COPYING for license details, README for program information.
6 Copyright (C) 2001-2005 Arne Thomassen <arne@arne-thomassen.de>
7 */
8
9 #include "stuff.h"
10 #include "parser.h"
11
12 declare_local_i18n_buffer
13 #if CONFIG_DEBUG
14 static const_after_init int fd_parsertest;
15 static char debugstrbuf[STRBUF_SIZE];
16 #define prsdbg(msg) my_write_str(fd_parsertest, msg)
17 #endif
18
19 static const char strCommentTag[] = "!--", strSingleQuote[] = "'",
20 strPipe[] = "|", strHref[] = "href", strType[] = "type",
21 strName[] = "name", strValue[] = "value", strDisabled[] = "disabled",
22 strTitle[] = "title", strStyle[] = "style", strReadonly[] = "readonly",
23 strAlt[] = "alt", strLabel[] = "label", strSize[] = "size",
24 strCenter[] = "center", strP[] = "p", strPi[] = "pi", strOtimes[] = "otimes";
25 #define strTimes (strOtimes + 1)
26
27 static const char strAcute[] = "acute", strCedil[] = "cedil",
28 strCirc[] = "circ", strTilde[] = "tilde", strUml[] = "uml", strSup[] = "sup";
29
30 my_enum1 enum
31 { htfNone = 0, htfRequireEndtag = 0x01, htfAllowEndtag = 0x02,
32 htfForbidEndtag = 0x04, htfForbidPre = 0x08, htfBlock = 0x10,
33 htfPar = 0x20, htfSoakUpText = 0x40
34 } my_enum2(unsigned char) tHtmlTagFlags;
35
36 /* begin-autogenerated */
37 #define TAGOFFSET (1)
38 #define NUM_TAGDATA (66)
39 static const struct
40 { const char* name; /* (sorted in alphabetical order) */
41 tHtmlTagFlags flags;
42 } tagdata[NUM_TAGDATA] =
43 { { strA, htfRequireEndtag },
44 { "address", htfRequireEndtag | htfBlock | htfPar },
45 { "area", htfForbidEndtag },
46 { "b", htfRequireEndtag },
47 { "big", htfRequireEndtag },
48 { "blockquote", htfRequireEndtag | htfBlock | htfPar },
49 { "body", htfAllowEndtag },
50 { "br", htfForbidEndtag },
51 { strButton, htfRequireEndtag },
52 { "caption", htfRequireEndtag | htfPar },
53 { strCenter, htfRequireEndtag | htfBlock | htfPar }, /* deprecated */
54 { "cite", htfRequireEndtag },
55 { "dd", htfAllowEndtag | htfBlock },
56 { "del", htfRequireEndtag },
57 { "dfn", htfRequireEndtag },
58 { "dir", htfRequireEndtag | htfBlock | htfPar }, /* deprecated */
59 { "div", htfRequireEndtag | htfBlock | htfPar },
60 { "dl", htfRequireEndtag | htfBlock | htfPar },
61 { "dt", htfAllowEndtag | htfBlock },
62 { "em", htfRequireEndtag },
63 { "fieldset", htfRequireEndtag | htfBlock | htfPar },
64 { "font", htfRequireEndtag }, /* deprecated */
65 { "form", htfRequireEndtag | htfBlock | htfPar },
66 { "frame", htfForbidEndtag | htfBlock },
67 { "frameset", htfRequireEndtag | htfBlock | htfPar },
68 { "h1", htfRequireEndtag | htfBlock | htfPar },
69 { "h2", htfRequireEndtag | htfBlock | htfPar },
70 { "h3", htfRequireEndtag | htfBlock | htfPar },
71 { "h4", htfRequireEndtag | htfBlock | htfPar },
72 { "h5", htfRequireEndtag | htfBlock | htfPar },
73 { "h6", htfRequireEndtag | htfBlock | htfPar },
74 { "head", htfAllowEndtag | htfForbidPre },
75 { "hr", htfForbidEndtag | htfBlock },
76 { strHtml, htfAllowEndtag },
77 { "i", htfRequireEndtag },
78 { "iframe", htfForbidEndtag | htfBlock },
79 { "img", htfForbidEndtag },
80 { "input", htfForbidEndtag },
81 { "li", htfAllowEndtag | htfBlock },
82 { "menu", htfRequireEndtag | htfBlock | htfPar }, /* deprecated */
83 { "meta", htfForbidEndtag },
84 { "noframes", htfRequireEndtag | htfBlock | htfPar },
85 { "noscript", htfRequireEndtag | htfBlock | htfPar },
86 { "object", htfRequireEndtag },
87 { "ol", htfRequireEndtag | htfBlock | htfPar },
88 { "optgroup", htfRequireEndtag },
89 { "option", htfAllowEndtag | htfSoakUpText | htfForbidPre },
90 { strP, htfAllowEndtag | htfBlock | htfPar },
91 { "pre", htfRequireEndtag | htfBlock | htfPar },
92 { "q", htfRequireEndtag }, /* stolen from J.B. & the next generation :-) */
93 { "s", htfRequireEndtag }, /* deprecated */
94 { "script", htfRequireEndtag | htfSoakUpText },
95 { strSelect, htfRequireEndtag },
96 { "strike", htfRequireEndtag }, /* deprecated */
97 { "strong", htfRequireEndtag },
98 { strStyle, htfRequireEndtag | htfSoakUpText },
99 { "sub", htfRequireEndtag },
100 { strSup, htfRequireEndtag },
101 { "table", htfRequireEndtag | htfBlock | htfPar },
102 { "td", htfAllowEndtag },
103 { "textarea", htfRequireEndtag | htfSoakUpText },
104 { "th", htfAllowEndtag },
105 { strTitle, htfRequireEndtag | htfSoakUpText | htfForbidPre },
106 { "tr", htfAllowEndtag | htfPar },
107 { "u", htfRequireEndtag }, /* deprecated */
108 { "ul", htfRequireEndtag | htfBlock | htfPar }
109 };
110
111 #if OPTION_CED > 0
112 #define NUM_CHENT (141)
113 #define CED01(ced0, ced1) ced1
114 #define DECLCED1(name, ced1) { name, ced1 },
115 #define CODE01(index0, index1) index1
116 #else
117 #define NUM_CHENT (89)
118 #define CED01(ced0, ced1) ced0
119 #define DECLCED1(name, ced1)
120 #define CODE01(index0, index1) index0
121 static const char chentcode_rerouter[] = "AaCcEeIiNnOoUuYy";
122 #endif
123
124 #define MAXLEN_CHENT (6)
125 static
126 #if OPTION_CED < 2
127 const
128 #else
129 const_after_init
130 #endif
131 struct
132 { const char* name; /* (sorted in strcmp() order) */
133 const char* result;
134 } chent[NUM_CHENT] =
135 { DECLCED1("AElig", "�") /*0*/
136 DECLCED1("Aacute", "�")
137 DECLCED1("Acirc", "�")
138 DECLCED1("Agrave", "�")
139 DECLCED1("Aring", "�")
140 DECLCED1("Atilde", "�") /*5*/
141 { "Auml", CED01("Ae", "�") },
142 DECLCED1("Ccedil", "�")
143 { "Dagger", strHm },
144 { "ETH", CED01("DH", "�") },
145 DECLCED1("Eacute", "�") /*10*/
146 DECLCED1("Ecirc", "�")
147 DECLCED1("Egrave", "�")
148 DECLCED1("Euml", "�")
149 DECLCED1("Iacute", "�")
150 DECLCED1("Icirc", "�") /*15*/
151 DECLCED1("Igrave", "�")
152 DECLCED1("Iuml", "�")
153 DECLCED1("Ntilde", "�")
154 DECLCED1("Oacute", "�")
155 DECLCED1("Ocirc", "�") /*20*/
156 DECLCED1("Ograve", "�")
157 DECLCED1("Oslash", "�")
158 DECLCED1("Otilde", "�")
159 { "Ouml", CED01("Oe", "�") },
160 { "Prime", "''" }, /*25*/
161 { "THORN", CED01("P", "�") },
162 DECLCED1("Uacute", "�")
163 DECLCED1("Ucirc", "�")
164 DECLCED1("Ugrave", "�")
165 { "Uuml", CED01("Ue", "�") }, /*30*/
166 DECLCED1("Yacute", "�")
167 DECLCED1("aacute", "�")
168 DECLCED1("acirc", "�")
169 { strAcute, CED01(strSingleQuote, "�") },
170 DECLCED1("aelig", "�") /*35*/
171 DECLCED1("agrave", "�")
172 { "amp", "&" },
173 { "apos", strSingleQuote },
174 DECLCED1("aring", "�")
175 DECLCED1("atilde", "�") /*40*/
176 { "auml", CED01("ae", "�") },
177 { "bdquo", strDoubleQuote },
178 DECLCED1("beta", "�")
179 { "boxv", strPipe },
180 { "brkbar", strPipe }, /*45*/
181 { "brvbar", strPipe },
182 { "bull", "o" },
183 DECLCED1("ccedil", "�")
184 { strCedil, CED01(",", "�") },
185 { "cent", CED01("-c-", "�") }, /*50*/
186 { strCirc, "^" },
187 { "copy", CED01("(C)", "�") },
188 { "curren", CED01("CUR", "�") },
189 { "darr", "v" },
190 { "deg", CED01("DEG", "�") }, /*55*/
191 { "divide", CED01("/", "�") },
192 DECLCED1("eacute", "�")
193 DECLCED1("ecirc", "�")
194 DECLCED1("egrave", "�")
195 { "eth", CED01("dh", "�") }, /*60*/
196 DECLCED1("euml", "�")
197 { "euro", "EUR" }, /* inconsequent, but ISO */
198 { "frac12", CED01(" 1/2", "�") },
199 { "frac14", CED01(" 1/4", "�") },
200 { "frac34", CED01(" 3/4", "�") }, /*65*/
201 { "frasl", "/" },
202 { "ge", ">=" },
203 { "gt", strGt },
204 { "hArr", "<=>" },
205 { "harr", "<->" }, /*70*/
206 DECLCED1("iacute", "�")
207 DECLCED1("icirc", "�")
208 { "iexcl", CED01("!", "�") },
209 DECLCED1("igrave", "�")
210 { "iquest", CED01(strQm, "�") }, /*75*/
211 DECLCED1("iuml", "�")
212 { "lArr", "<=" },
213 { "lang", strLt },
214 { "laquo", strDoubleQuote },
215 { "larr", "<-" }, /*80*/
216 { "ldquo", strDoubleQuote },
217 { "le", "<=" },
218 { "lsaquo", strSingleQuote },
219 { "lsquo", strSingleQuote },
220 { "lt", strLt }, /*85*/
221 { "mdash", strMinus },
222 { "middot", CED01(".", "�") },
223 { "minus", strMinus },
224 { "nbsp", strSpace }, /* FIXME! */
225 { "ndash", strMinus }, /*90*/
226 { "ne", "!=" }, /* CHECKME! */
227 { "not", CED01("NOT", "�") }, /* CHECKME! */
228 DECLCED1("ntilde", "�")
229 DECLCED1("oacute", "�")
230 DECLCED1("ocirc", "�") /*95*/
231 DECLCED1("ograve", "�")
232 { "oplus", "+" },
233 { "ordf", CED01("-a", "�") },
234 { "ordm", CED01("-o", "�") },
235 DECLCED1("oslash", "�") /*100*/
236 DECLCED1("otilde", "�")
237 { strOtimes, strAsterisk }, /* CHECKME! */
238 { "ouml", CED01("oe", "�") },
239 { "para", CED01("par.", "�") }, /* CHECKME! */
240 { "permil", "o/oo" }, /*105*/
241 { strPi, strPi },
242 { "plusmn", CED01("+/-", "�") },
243 { "pound", CED01("-L-", "�") },
244 { "prime", strSingleQuote },
245 { "quot", strDoubleQuote }, /*110*/
246 { "rArr", "=>" },
247 { "rang", strGt },
248 { "raquo", strDoubleQuote },
249 { "rarr", "->" },
250 { "rdquo", strDoubleQuote }, /*115*/
251 { "reg", CED01("(R)", "�") },
252 { "rsaquo", strSingleQuote },
253 { "rsquo", strSingleQuote },
254 { "sbquo", strSingleQuote },
255 { "sdot", CED01(strAsterisk, "�") }, /*120*/ /* CHECKME! */
256 { "sect", CED01("sect.", "�") }, /* CHECKME! */
257 { "shy", strEmpty },
258 { "sim", "~" },
259 { "sup1", CED01("^1", "�") },
260 { "sup2", CED01("^2", "�") }, /*125*/
261 { "sup3", CED01("^3", "�") },
262 { "szlig", CED01("ss", "�") },
263 { "thorn", CED01(strP, "�") },
264 { strTilde, "~" },
265 { strTimes, CED01(strAsterisk, "�") }, /*130*/
266 { "trade", "(tm)" },
267 DECLCED1("uacute", "�")
268 { "uarr", "^" },
269 DECLCED1("ucirc", "�")
270 DECLCED1("ugrave", "�") /*135*/
271 { strUml, CED01(strSpace, "�") },
272 { "uuml", CED01("ue", "�") },
273 DECLCED1("yacute", "�")
274 { "yen", CED01("-Y-", "�") },
275 DECLCED1("yuml", "�")
276 };
277
278 typedef unsigned short tEntityCode;
279 #define NUM_CHENTCODE (139)
280 static const struct
281 { tEntityCode code; /* (sorted in numerical order) */
282 signed short index; /* chent[] or chentcode_rerouter[] index */
283 } chentcode[NUM_CHENTCODE] =
284 { { 34, CODE01(63, 110) },
285 { 38, CODE01(8, 37) },
286 { 39, CODE01(9, 38) },
287 { 60, CODE01(44, 85) },
288 { 62, CODE01(31, 68) },
289 { 160, CODE01(48, 89) },
290 { 161, CODE01(34, 73) },
291 { 162, CODE01(17, 50) },
292 { 163, CODE01(61, 108) },
293 { 164, CODE01(20, 53) },
294 { 165, CODE01(88, 139) },
295 { 166, CODE01(14, 46) },
296 { 167, CODE01(74, 121) },
297 { 168, CODE01(86, 136) },
298 { 169, CODE01(19, 52) },
299 { 170, CODE01(53, 98) },
300 { 171, CODE01(38, 79) },
301 { 172, CODE01(51, 92) },
302 { 173, CODE01(75, 122) },
303 { 174, CODE01(69, 116) },
304 { 176, CODE01(22, 55) },
305 { 177, CODE01(60, 107) },
306 { 178, CODE01(78, 125) },
307 { 179, CODE01(79, 126) },
308 { 180, CODE01(7, 34) },
309 { 182, CODE01(57, 104) },
310 { 183, CODE01(46, 87) },
311 { 184, CODE01(16, 49) },
312 { 185, CODE01(77, 124) },
313 { 186, CODE01(54, 99) },
314 { 187, CODE01(66, 113) },
315 { 188, CODE01(27, 64) },
316 { 189, CODE01(26, 63) },
317 { 190, CODE01(28, 65) },
318 { 191, CODE01(35, 75) },
319 { 192, CODE01(-1, 3) },
320 { 193, CODE01(-1, 1) },
321 { 194, CODE01(-1, 2) },
322 { 195, CODE01(-1, 5) },
323 { 196, CODE01(0, 6) },
324 { 197, CODE01(-1, 4) },
325 { 198, CODE01(-1, 0) },
326 { 199, CODE01(-3, 7) },
327 { 200, CODE01(-5, 12) },
328 { 201, CODE01(-5, 10) },
329 { 202, CODE01(-5, 11) },
330 { 203, CODE01(-5, 13) },
331 { 204, CODE01(-7, 16) },
332 { 205, CODE01(-7, 14) },
333 { 206, CODE01(-7, 15) },
334 { 207, CODE01(-7, 17) },
335 { 208, CODE01(2, 9) },
336 { 209, CODE01(-9, 18) },
337 { 210, CODE01(-11, 21) },
338 { 211, CODE01(-11, 19) },
339 { 212, CODE01(-11, 20) },
340 { 213, CODE01(-11, 23) },
341 { 214, CODE01(3, 24) },
342 { 215, CODE01(83, 130) },
343 { 216, CODE01(-11, 22) },
344 { 217, CODE01(-13, 29) },
345 { 218, CODE01(-13, 27) },
346 { 219, CODE01(-13, 28) },
347 { 220, CODE01(6, 30) },
348 { 221, CODE01(-15, 31) },
349 { 222, CODE01(5, 26) },
350 { 223, CODE01(80, 127) },
351 { 224, CODE01(-2, 36) },
352 { 225, CODE01(-2, 32) },
353 { 226, CODE01(-2, 33) },
354 { 227, CODE01(-2, 40) },
355 { 228, CODE01(10, 41) },
356 { 229, CODE01(-2, 39) },
357 { 230, CODE01(-2, 35) },
358 { 231, CODE01(-4, 48) },
359 { 232, CODE01(-6, 59) },
360 { 233, CODE01(-6, 57) },
361 { 234, CODE01(-6, 58) },
362 { 235, CODE01(-6, 61) },
363 { 236, CODE01(-8, 74) },
364 { 237, CODE01(-8, 71) },
365 { 238, CODE01(-8, 72) },
366 { 239, CODE01(-8, 76) },
367 { 240, CODE01(24, 60) },
368 { 241, CODE01(-10, 93) },
369 { 242, CODE01(-12, 96) },
370 { 243, CODE01(-12, 94) },
371 { 244, CODE01(-12, 95) },
372 { 245, CODE01(-12, 101) },
373 { 246, CODE01(56, 103) },
374 { 247, CODE01(23, 56) },
375 { 248, CODE01(-12, 100) },
376 { 249, CODE01(-14, 135) },
377 { 250, CODE01(-14, 132) },
378 { 251, CODE01(-14, 134) },
379 { 252, CODE01(87, 137) },
380 { 253, CODE01(-16, 138) },
381 { 254, CODE01(81, 128) },
382 { 255, CODE01(-16, 140) },
383 { 710, CODE01(18, 51) },
384 { 732, CODE01(82, 129) },
385 { 960, CODE01(59, 106) },
386 { 8211, CODE01(49, 90) },
387 { 8212, CODE01(45, 86) },
388 { 8216, CODE01(43, 84) },
389 { 8217, CODE01(71, 118) },
390 { 8218, CODE01(72, 119) },
391 { 8220, CODE01(40, 81) },
392 { 8221, CODE01(68, 115) },
393 { 8222, CODE01(11, 42) },
394 { 8225, CODE01(1, 8) },
395 { 8226, CODE01(15, 47) },
396 { 8240, CODE01(58, 105) },
397 { 8242, CODE01(62, 109) },
398 { 8243, CODE01(4, 25) },
399 { 8249, CODE01(42, 83) },
400 { 8250, CODE01(70, 117) },
401 { 8260, CODE01(29, 66) },
402 { 8364, CODE01(25, 62) },
403 { 8482, CODE01(84, 131) },
404 { 8592, CODE01(39, 80) },
405 { 8593, CODE01(85, 133) },
406 { 8594, CODE01(67, 114) },
407 { 8595, CODE01(21, 54) },
408 { 8596, CODE01(33, 70) },
409 { 8656, CODE01(36, 77) },
410 { 8658, CODE01(64, 111) },
411 { 8660, CODE01(32, 69) },
412 { 8722, CODE01(47, 88) },
413 { 8764, CODE01(76, 123) },
414 { 8800, CODE01(50, 91) },
415 { 8804, CODE01(41, 82) },
416 { 8805, CODE01(30, 67) },
417 { 8853, CODE01(52, 97) },
418 { 8855, CODE01(55, 102) },
419 { 8901, CODE01(73, 120) },
420 { 9001, CODE01(37, 78) },
421 { 9002, CODE01(65, 112) },
422 { 9474, CODE01(12, 44) }
423 };
424
425 /* character entity suffix handling */
426 enum { csiMax=13, csiDash=9, csiSpace=10 }; /* "csi": chentsuffix[] index */
427 static const char* const chentsuffix[csiMax + 1] =
428 { strAcute, strCedil, strCirc, "grave", "lig", "ring", "slash", strTilde,
429 strUml, "dash", "sp", "caron", "comma", "cy"
430 };
431 static const unsigned char chentsuffixlen[csiMax + 1] =
432 { 5, 5, 4, 5, 3, 4, 5, 5, 3, 4, 2, 5, 5, 2 };
433
434 static const struct
435 { const char* str; /* (sorted in alphabetical order) */
436 tAttributeName an;
437 } attrdata[] =
438 { { "action", anAction },
439 { "align", anAlign },
440 { strAlt, anAlt },
441 { "checked", anChecked },
442 { "class", anClass },
443 { "color", anColor },
444 { "content", anContent },
445 { "declare", anDeclare },
446 { strDisabled, anDisabled },
447 { "enctype", anEnctype },
448 { "face", anFace },
449 { strHref, anHref },
450 { "http-equiv", anHttpEquiv },
451 { "id", anId },
452 { strLabel, anLabel },
453 { "language", anLanguage },
454 { "maxlength", anMaxlength },
455 { "media", anMedia },
456 { "method", anMethod },
457 { "multiple", anMultiple },
458 { strName, anName },
459 { strReadonly, anReadonly },
460 { "selected", anSelected },
461 { strSize, anSize },
462 { "src", anSrc },
463 { strStyle, anStyle },
464 { strTitle, anTitle },
465 { strType, anType },
466 { strValue, anValue },
467 { "width", anWidth }
468 };
469
470 /* which HTML attributes need value conversion (bitfield) */
471 static const unsigned char attrvalueconv[5] = { 106, 12, 145, 251, 1 };
472 /* end-autogenerated */
473
474 enum
475 { hpsText = 0, hpsTag = 1, hpsAttrName = 2, hpsEquals = 3, hpsAttrValue = 4,
476 hpsDone = 5, hpsComment1 = 6, hpsComment2 = 7, hpsComment3 = 8
477 };
478 typedef unsigned char tHtmlParserState;
479 #define MAX_HPS (8)
480
481 #if CONFIG_DEBUG
482 static const char* const hps_name[MAX_HPS + 1] =
483 { strText, "tag", "attrname", "=", "attrvalue", "done", "comment1", "comment2",
484 "comment3"
485 };
486 #endif
487
488 static tHtmlParserState state;
489 static tAttributeName current_attr_name;
490 static tHtmlTagKind current_tagkind;
491 static char* current_unknown_tagname;
492 static tAttribute* current_attributes;
493
494 static unsigned int bufsize, maxbufsize;
495
496 static tBoolean is_endtag, tagblock_ends;
497 static unsigned char attrvalue_quotes; /* 0=none, 1=single-, 2=double-quotes */
498
499 static tCantent* current_cantent;
500 static tBoolean is_current_node_valid, is_parsing_done, inside_select;
501 static tHtmlNode *current_node, *current_node_in_tree, *previous_node_in_tree,
502 *delayed_node, *select_node;
503 static const char* dataptr;
504 static char* buf;
505 static tContentblock *current_block, *lhpp_content;
506 static size_t current_block_sizeleft, lhpp_byte;
507 static tActiveElementBase curraebase, select_aebase, *aebase;
508 static tActiveElementNumber aenum, aemax;
509
510 #define IS_WHITESPACE(ch) \
511 ( ((ch) == ' ') || ((ch) == '\t') || ((ch) == '\n') || ((ch) == '\r') )
512
buf_append(const char ch)513 static void buf_append(const char ch)
514 { if (maxbufsize <= bufsize)
515 { maxbufsize += 1024;
516 buf = memory_reallocate(buf, maxbufsize, mapString);
517 }
518 buf[bufsize++] = ch;
519 }
520
static tHtmlInputLength attr2htmlinputlength(const tAttribute* attr,
  tHtmlInputLength _default)
/* converts the value of the attribute <attr> to an input length which is
   limited to the range 1..MAX_HTML_INPUT_LENGTH; returns <_default> if there
   is no attribute or if the attribute has no value */
{ const char* text;
  int parsed;
  if (attr == NULL) return(_default);
  text = attr->value;
  if (text == NULL) return(_default);
  my_atoi(text, &parsed, NULL, MAX_HTML_INPUT_LENGTH + 1);
  /* clamp the parsed number (upper bound is checked first) */
  if (parsed > MAX_HTML_INPUT_LENGTH) parsed = MAX_HTML_INPUT_LENGTH;
  else if (parsed < 1) parsed = 1;
  return((tHtmlInputLength) parsed);
}
536
do_lookup_tagkind(void)537 static one_caller tMbsIndex do_lookup_tagkind(void)
538 { my_binary_search(0, NUM_TAGDATA - 1, strcmp(buf, tagdata[idx].name),
539 return(idx))
540 }
541
lookup_tagkind(void)542 static one_caller tHtmlTagKind lookup_tagkind(void)
543 /* transforms a tag name string (in <buf>) to the corresponding tag kind
544 number */
545 { tMbsIndex idx = do_lookup_tagkind();
546 if (idx >= 0) return(((tHtmlTagKind) idx) + TAGOFFSET);
547 else return(htkInvalid);
548 }
549
550 enum { ceisCopy = 0, ceisKind = 1, ceisNumkind = 2, ceisInside = 3 };
551 typedef unsigned char tCharacterEntityInterpreterState; /* (-: */
552
553 enum { ekString = 0, ekDecNumber = 1, ekHexNumber = 2 };
554 typedef unsigned char tEntityKind;
555
do_lookup_entity_string(const char * str,size_t len)556 static one_caller tMbsIndex do_lookup_entity_string(const char* str,size_t len)
557 { my_binary_search(0, NUM_CHENT - 1, strncmp(str, chent[idx].name, len),
558 return(idx))
559 }
560
lookup_entity_string(const char * str,size_t len)561 static one_caller tMbsIndex lookup_entity_string(const char* str, size_t len)
562 { /* Try to find a "candidate": */
563 tMbsIndex retval = do_lookup_entity_string(str, len);
564 /* Check whether the candidate is okay: */
565 if ( (retval >= 0) && (strlen(chent[retval].name) != len) )
566 retval = INVALID_INDEX;
567 return(retval);
568 }
569
570 #define cec(code1, code2) my_numcmp(code1, code2) /* compare entity codes */
571
lookup_entity_code(tEntityCode code)572 static one_caller tMbsIndex lookup_entity_code(tEntityCode code)
573 { my_binary_search(0, NUM_CHENTCODE - 1, cec(code, chentcode[idx].code),
574 return(idx))
575 }
576
guessed_entity(const char * str,size_t len,char ** _dest)577 static one_caller tBoolean guessed_entity(const char* str, size_t len,
578 char** _dest)
579 /* tries to "guess" the meaning of an unknown character entity by looking at
580 its prefix or suffix, tries to make the best out of that, and returns
581 whether all that worked */
582 { if (len < 4) goto out; /* can't do anything here */
583 if ( (!strncmp(str, strSup, 3)) && /* entity has prefix "sup" */
584 (! ( (len == 4) && (str[3] == 'e') ) ) ) /* and isn't "⊇" */
585 { char* dest;
586 str += 3; len -= 3;
587 copy:
588 dest = *_dest;
589 while (len-- > 0) *dest++ = *str++;
590 *_dest = dest;
591 return(truE);
592 }
593 else /* check for suffices */
594 { const char* end = str + len;
595 unsigned short idx;
596 for (idx = 0; idx <= csiMax; idx++)
597 { size_t sufflen = (size_t) chentsuffixlen[idx];
598 if (len > sufflen)
599 { const char* suff = chentsuffix[idx];
600 if (!strncmp(end - sufflen, suff, sufflen))
601 { if (idx == csiDash) { str = strMinus; len = 1; }
602 else if (idx == csiSpace) { str = strSpace; len = 1; }
603 else len -= sufflen;
604 goto copy;
605 }
606 }
607 }
608 }
609 out:
610 return(falsE); /* didn't find anything */
611 }
612
shall_interpret_chents_in_attrvalue(void)613 static one_caller tBoolean shall_interpret_chents_in_attrvalue(void)
614 { tBoolean retval = cond2boolean( (current_attr_name < NUM_ATTRNAMES) &&
615 (my_bit_test(attrvalueconv, current_attr_name)) );
616 if (retval)
617 { /* Have to handle some special cases. Let's thank the htmlspec writers for
618 this bogosity... */
619 if ( ( (current_tagkind == htkMeta) && (current_attr_name == anName) )
620 #if 0
621 /* These aren't yet implemented. */
622 || ( (current_tagkind == htkLi) && (current_attr_name == anValue) )
623 || ( (current_tagkind == htkSelect) && (current_attr_name == anSize) )
624 #endif
625 )
626 retval = falsE;
627 }
628 return(retval);
629 }
630
interpret_character_entities(char * origdest,const char * src,tBoolean may_trim)631 static one_caller void interpret_character_entities(char* origdest,
632 const char* src, tBoolean may_trim)
633 { const char *start SHUT_UP_COMPILER(NULL), *start0 SHUT_UP_COMPILER(NULL);
634 char* dest = origdest;
635 tCharacterEntityInterpreterState ceis = ceisCopy;
636 tEntityKind kind SHUT_UP_COMPILER(ekString);
637 unsigned char lenleft SHUT_UP_COMPILER(0);
638 if (may_trim) { while (IS_WHITESPACE(*src)) src++; }
639 while (1)
640 { char ch = *src;
641 unsigned int _code;
642 switch (ceis)
643 {case ceisCopy: /* the most likely case */
644 if (ch == '&') { start0 = src; ceis = ceisKind; }
645 else { *dest++ = ch; if (ch == '\0') goto out; }
646 break;
647 case ceisKind: /* find out the "kind" of the entity */
648 if (ch == '#') ceis = ceisNumkind; /* it's some numeric kind */
649 else /* it's a string */
650 { kind = ekString; start = src; lenleft = MAXLEN_CHENT + 1;
651 ceis = ceisInside; goto inside;
652 }
653 break;
654 case ceisNumkind: /* find out whether it's decimal or hex */
655 lenleft = 4 + 1; ceis = ceisInside;
656 if ( (ch == 'x') || (ch == 'X') ) { kind = ekHexNumber; start = src + 1;}
657 else { kind = ekDecNumber; start = src; goto inside; }
658 break;
659 case ceisInside: /* "inside" the entity */
660 inside:
661 lenleft--;
662 if ( (ch == '\0') || (ch == ';') || (ch == ' ') || (ch == '&') ||
663 (!lenleft) )
664 { /* found an end-point */
665 if (src <= start + 1) /* can't have found anything useful - CHECKME! */
666 { postcopy:
667 while (start0 <= src) *dest++ = *start0++;
668 if (ch == '\0') goto out;
669 }
670 else if (kind == ekString)
671 { size_t len = src - start;
672 tMbsIndex idx = lookup_entity_string(start, len);
673 if (idx >= 0) /* found in list */
674 { const char* temp = chent[idx].result;
675 while (*temp) *dest++ = *temp++;
676 }
677 else if (!guessed_entity(start, len, &dest))
678 goto postcopy; /* no idea */
679 }
680 else if (kind == ekDecNumber)
681 { const char* temp = start;
682 tMbsIndex idx;
683 _code = 0;
684 while (temp < src)
685 { char c = *temp++;
686 if (my_isdigit(c)) _code = 10 * _code + (c - '0');
687 else goto postcopy; /* not a decimal number */
688 }
689 handle_code:
690 if (_code > chentcode[NUM_CHENTCODE - 1].code) goto postcopy;
691 idx = lookup_entity_code((tEntityCode) _code);
692 if (idx < 0)
693 { if ( (_code >= 32) &&
694 #if OPTION_CED == 0
695 (_code < 127)
696 #else
697 (_code <= 255) && (_code != 127)
698 #endif
699 )
700 { *dest++ = (char) _code; } /* interpreted as ASCII code */
701 else goto postcopy;
702 }
703 else
704 { const signed short i = chentcode[idx].index;
705 #if OPTION_CED == 0
706 if (i < 0) *dest++ = chentcode_rerouter[-1 - i];
707 else
708 #endif
709 { temp = chent[i].result;
710 while (*temp) *dest++ = *temp++;
711 }
712 }
713 }
714 else if (kind == ekHexNumber)
715 { const char* temp = start;
716 _code = 0;
717 while (temp < src)
718 { char c = *temp++;
719 unsigned int add;
720 if (my_isdigit(c))
721 { add = c - '0';
722 calc:
723 _code = 16 * _code + add;
724 }
725 else if ( (c >= 'a') && (c <= 'f') )
726 { add = c - 'a' + 10; goto calc; }
727 else if ( (c >= 'A') && (c <= 'F') )
728 { add = c - 'A' + 10; goto calc; }
729 else goto postcopy; /* not a hexadecimal number */
730 }
731 goto handle_code;
732 }
733 ceis = ceisCopy;
734 }
735 break;
736 }
737 if (ch == '\0') { *dest = '\0'; goto out; }
738 else src++;
739 }
740 out:
741 if (may_trim) /* remove trailing whitespace */
742 { dest = origdest + strlen(origdest) - 1; /* IMPROVEME? */
743 while (dest >= origdest)
744 { const char c = *dest;
745 if (!IS_WHITESPACE(c)) break;
746 *dest-- = '\0';
747 }
748 }
749 }
750
find_and_detach_attribute(tAttribute ** list,tAttributeName name)751 static tAttribute* find_and_detach_attribute(tAttribute** list,
752 tAttributeName name)
753 /* searches and extracts an attribute of the given <name> from the <list> */
754 { tAttribute *a = *list, *b;
755 if (a == NULL) return(NULL);
756 if (a->name == name) { *list = a->next; a->next = NULL; return(a); }
757 while ( (b = a->next) != NULL )
758 { if (b->name == name) { a->next = b->next; b->next = NULL; return(b); }
759 a = b;
760 }
761 return(NULL);
762 }
763
_find_and_detach_attribute(void ** list,tAttributeName name)764 static __my_inline tAttribute* _find_and_detach_attribute(void** list,
765 tAttributeName name)
766 { return(find_and_detach_attribute((tAttribute**) list, name));
767 /* nasty casting rubbish */
768 }
769
770 #define __fada(list, name) find_and_detach_attribute(&list, name)
771 #define fada(name) __fada(current_attributes, name)
772
773 #if CONFIG_JAVASCRIPT
fada_js(void ** __list)774 static one_caller tAttribute* fada_js(void** __list)
775 /* like find_and_detach_attribute(), but for Javascript-related attributes */
776 { tAttribute **_list = (tAttribute**) __list; /* nasty casting rubbish */
777 tAttribute *retval, *list = *_list;
778 if (list == NULL) retval = NULL;
779 else if (is_an_for_javascript(list->name))
780 { retval = list; *_list = retval->next; retval->next = NULL; }
781 else
782 { tAttribute *a = list, *next;
783 while ( (next = a->next) != NULL )
784 { if (is_an_for_javascript(next->name))
785 { a->next = next->next; retval = next; retval->next = NULL; goto out; }
786 a = next;
787 }
788 retval = NULL;
789 }
790 out:
791 return(retval);
792 }
793 #endif
794
795 /* prepare curraebase */
796 #define set_caeb(_kind) \
797 do { my_memclr_var(curraebase); curraebase.kind = _kind; } while (0)
798
799 /* move attribute value */
800 #define __moav(dest, attr) \
801 do { curraebase.dest = attr->value; attr->value = NULL; } while (0)
802 #define moavd(attr) __moav(data, attr)
803 #define moavr(attr) __moav(render, attr)
804
805 /* deallocate an attribute and all associated data */
806 #define __deattr(a) do { __dealloc(a->value); memory_deallocate(a); } while (0)
807 #define deattr(a) do { if (a != NULL) __deattr(a); } while (0)
808
809 #define NUM_INPUT_TYPE (10)
810 static const struct
811 { const char* name; /* (sorted in alphabetical order) */
812 tActiveElementKind kind;
813 } input_type[NUM_INPUT_TYPE] =
814 { { strButton, aekFormButton },
815 { strCheckbox, aekFormCheckbox },
816 { strFile, aekFormFile },
817 { "hidden", aekFormHidden },
818 { strImage, aekFormImage },
819 { "password", aekFormPassword },
820 { "radio", aekFormRadio },
821 { strReset, aekFormReset },
822 { strSubmit, aekFormSubmit },
823 { strText, aekFormText }
824 };
825
do_lookup_input_type(const char * str)826 static one_caller tMbsIndex do_lookup_input_type(const char* str)
827 { my_binary_search(0, NUM_INPUT_TYPE - 1, streqcase3(str,
828 input_type[idx].name), return(idx))
829 }
830
lookup_input_type(const tAttribute * attr)831 static one_caller tActiveElementKind lookup_input_type(const tAttribute* attr)
832 { tActiveElementKind retval = aekFormText; /* htmlspec default */
833 if (attr != NULL)
834 { const char* av = attr->value;
835 if ( (av != NULL) && (*av != '\0') ) /* non-empty attribute value */
836 { tMbsIndex idx = do_lookup_input_type(av);
837 if (idx >= 0) retval = input_type[idx].kind;
838 else retval = aekUnknown;
839 }
840 }
841 return(retval);
842 }
843
844 #if CONFIG_JAVASCRIPT
lookup_javascript_event(void)845 static one_caller tMbsIndex lookup_javascript_event(void)
846 { const char* str;
847 size_t len = bufsize - 1;
848 if ( (len < JAVASCRIPT_MIN_EVENT_NAME_LENGTH) ||
849 (my_tolower(buf[0]) != 'o') || (my_tolower(buf[1]) != 'n') )
850 return(INVALID_INDEX);
851 str = buf + 2;
852 my_binary_search(0, JAVASCRIPT_MAX_EVENT_CODE, strcmp(str, strJek[idx]),
853 return(idx))
854 }
855 #endif
856
use_curraebase(void)857 static one_caller tBoolean use_curraebase(void)
858 /* checks whether an active-element base should be created for the current node
859 and prepares that if so */
860 { tBoolean retval = falsE; /* the most likely result */
861 switch (current_tagkind)
862 { case htkA: case htkArea:
863 { tAttribute* h = fada(anHref);
864 if ( (h != NULL) && (h->value != NULL) && (h->value[0] != '\0') )
865 { set_caeb(aekLink); moavd(h); retval = truE; }
866 deattr(h);
867 }
868 break;
869 case htkFrame: case htkIframe:
870 { tAttribute *s = fada(anSrc), *t = fada(anTitle);
871 if ( (s != NULL) && (s->value != NULL) && (s->value[0] != '\0') )
872 { char* tv;
873 set_caeb(aekLink); moavd(s); retval = truE;
874 if ( (t != NULL) && ( (tv = t->value) != NULL ) && (*tv != '\0') )
875 t->value = NULL; /* detach */
876 else tv = my_strdup(_("[a frame]"));
877 curraebase.render = tv;
878 }
879 deattr(s); deattr(t);
880 }
881 break;
882 case htkInput:
883 { tAttribute *t = fada(anType), *n = fada(anName), *v = fada(anValue),
884 *s = fada(anSize), *m = fada(anMaxlength), *a = fada(anAlt),
885 *ch = fada(anChecked), *di = fada(anDisabled), *re = fada(anReadonly);
886 tActiveElementKind kind = lookup_input_type(t);
887 if (kind != aekUnknown)
888 { tActiveElementFlags flags = aefNone;
889 const char* render;
890 set_caeb(kind);
891 if ( (n != NULL) && (n->value != NULL) && (n->value[0] != '\0') )
892 moavd(n);
893 switch (kind)
894 { case aekFormSubmit:
895 render = _("Submit");
896 handle_render:
897 if ( (v != NULL) && (v->value != NULL) && (v->value[0] != '\0') )
898 { render = v->value; v->value = NULL; } /* explicit value given */
899 else render = my_strdup(render);
900 curraebase.render = render; break;
901 case aekFormReset:
902 render = _("Reset"); goto handle_render; /*@notreached@*/ break;
903 case aekFormButton:
904 render = _("[a push button]"); goto handle_render; /*@notreached@*/
905 break;
906 case aekFormImage:
907 if ( (a != NULL) && (a->value != NULL) && (a->value[0] != '\0') )
908 { curraebase.render = a->value; a->value = NULL; }
909 else curraebase.render = my_strdup(_("[a form image]"));
910 break;
911 case aekFormText: case aekFormPassword: case aekFormRadio:
912 case aekFormHidden: /* case aekFormFile: */
913 /* Privacy Note: for aekFormFile, we don't store the default value
914 because storing it might lead to an unwanted transmission of
915 local file contents (if the user submits the form without
916 recognizing that there is an aekFormFile element in it). See e.g.
917 RFC1867, 8. */
918 if ( (v != NULL) && (v->value != NULL) && (v->value[0] != '\0') )
919 moavr(v); /* default text */
920 break;
921 }
922
923 if (has_input_length(kind))
924 { tHtmlInputLength max = attr2htmlinputlength(m,MAX_HTML_INPUT_LENGTH),
925 size = attr2htmlinputlength(s, 20);
926 if (size > max) size = max;
927 curraebase.size = size; curraebase.maxlength = max;
928 }
929
930 if (ch != NULL) flags |= aefCheckedSelected;
931 if (di != NULL) flags |= aefDisabled;
932 if (re != NULL) flags |= aefReadonly;
933 curraebase.flags = flags; retval = truE;
934 }
935 deattr(t); deattr(n); deattr(v); deattr(s); deattr(m); deattr(a);
936 deattr(ch); deattr(di); deattr(re);
937 }
938 break;
939 case htkTextarea:
940 { tAttribute *n = fada(anName), *di = fada(anDisabled),
941 *re = fada(anReadonly);
942 tActiveElementFlags flags = aefNone;
943 set_caeb(aekFormText);
944 if ((n != NULL) && (n->value != NULL) && (n->value[0] != '\0')) moavd(n);
945 if (di != NULL) flags |= aefDisabled;
946 if (re != NULL) flags |= aefReadonly;
947 curraebase.flags = flags; curraebase.size = 20; retval = truE;
948 deattr(n); deattr(di); deattr(re);
949 }
950 break;
951 case htkButton:
952 { tAttribute *t = fada(anType), *n = fada(anName), *v = fada(anValue),
953 *di = fada(anDisabled);
954 tActiveElementKind kind = aekFormSubmit; /* htmlspec default */
955 const char* temp;
956 if ((n != NULL) && (n->value != NULL) && (n->value[0] != '\0')) moavd(n);
957 if ( (t != NULL) && ( (temp = t->value) != NULL ) )
958 { if (streqcase(temp, strButton)) kind = aekFormButton;
959 else if (streqcase(temp, strReset)) kind = aekFormReset;
960 else if (!streqcase(temp, strSubmit)) kind = aekUnknown;
961 }
962 if (kind != aekUnknown)
963 { tActiveElementFlags flags = aefButtonTag;
964 if (di != NULL) flags |= aefDisabled;
965 set_caeb(kind);
966 if (v != NULL) moavr(v);
967 curraebase.flags = flags; retval = truE;
968 }
969 deattr(t); deattr(n); deattr(v); deattr(di);
970 }
971 break;
972 case htkSelect:
973 { tAttribute *n = fada(anName), *mu = fada(anMultiple),
974 *di = fada(anDisabled);
975 tActiveElementFlags flags = aefNone;
976 set_caeb(aekFormSelect);
977 if ((n != NULL) && (n->value != NULL) && (n->value[0] != '\0')) moavd(n);
978 if (mu != NULL) flags |= aefMultiple;
979 if (di != NULL) flags |= aefDisabled;
980 curraebase.flags = flags;
981 select_aebase = curraebase; inside_select = truE;
982 /* retval = falsE; -- yes, not "truE" here; this one is special... */
983 deattr(n); deattr(mu); deattr(di);
984 }
985 break;
986 }
987 return(retval);
988 }
989
990 static my_inline __sallocator tAttribute* __callocator
create_attribute(const tAttributeName name)991 create_attribute(const tAttributeName name)
992 { tAttribute* retval = (tAttribute*) memory_allocate(sizeof(tAttribute),
993 mapOther);
994 retval->name = name;
995 return(retval);
996 }
997
deallocate_attributes(const tAttribute * a)998 static my_inline void deallocate_attributes(const tAttribute* a)
999 /* deallocates the given attribute list */
1000 { while (a != NULL)
1001 { const tAttribute* next = a->next;
1002 __deattr(a); a = next;
1003 }
1004 }
1005
deallocate_html_node(const tHtmlNode * node)1006 void deallocate_html_node(const tHtmlNode* node)
1007 { if (node->kind == htkText) __dealloc((const char*) (node->data));
1008 else deallocate_attributes((const tAttribute*) (node->data));
1009 memory_deallocate(node);
1010 }
1011
deallocate_one_aebase(const tActiveElementBase * aeb)1012 void deallocate_one_aebase(const tActiveElementBase* aeb)
1013 { __dealloc(aeb->data);
1014 if (aeb->kind != aekFormSelect) __dealloc(aeb->render); /* the simple case */
1015 else
1016 { const tHtmlOption* o = (const tHtmlOption*) aeb->render;
1017 while (o != NULL)
1018 { const tHtmlOption* next = o->next;
1019 __dealloc(o->value); __dealloc(o->render); memory_deallocate(o);
1020 o = next;
1021 }
1022 }
1023 #if CONFIG_JAVASCRIPT
1024 javascript_remove_ehs(aeb->eh);
1025 #endif
1026 }
1027
htk_has_flag(const tHtmlTagKind kind,const tHtmlTagFlags flag)1028 static my_inline tBoolean htk_has_flag(const tHtmlTagKind kind,
1029 const tHtmlTagFlags flag)
1030 { return(cond2boolean( (kind >= TAGOFFSET) && (kind < TAGOFFSET + NUM_TAGDATA)
1031 && (tagdata[kind - TAGOFFSET].flags & flag) ));
1032 }
1033
htk_soaks_up_text(const tHtmlTagKind kind)1034 static __my_inline tBoolean htk_soaks_up_text(const tHtmlTagKind kind)
1035 { return(htk_has_flag(kind, htfSoakUpText));
1036 }
1037
htk_forbids_endtag(const tHtmlTagKind kind)1038 __my_inline tBoolean htk_forbids_endtag(const tHtmlTagKind kind)
1039 { return(htk_has_flag(kind, htfForbidEndtag));
1040 }
1041
htk_forbids_pre(const tHtmlTagKind kind)1042 __my_inline tBoolean htk_forbids_pre(const tHtmlTagKind kind)
1043 { return(htk_has_flag(kind, htfForbidPre));
1044 }
1045
htk_is_block(const tHtmlTagKind kind)1046 __my_inline tBoolean htk_is_block(const tHtmlTagKind kind)
1047 { return(htk_has_flag(kind, htfBlock));
1048 }
1049
htk_is_par(const tHtmlTagKind kind)1050 __my_inline tBoolean htk_is_par(const tHtmlTagKind kind)
1051 { return(htk_has_flag(kind, htfPar));
1052 }
1053
create_html_form(const char * action,tHtmlFormFlags flags)1054 static one_caller void create_html_form(const char* action,
1055 tHtmlFormFlags flags)
1056 { tHtmlFormNumber num = current_cantent->hfnum, max = current_cantent->hfmax;
1057 tHtmlForm* f;
1058 if (num >= max)
1059 { max += ( (max >= 9) ? 10 : 3 ); current_cantent->hfmax = max;
1060 current_cantent->form = memory_reallocate(current_cantent->form,
1061 max * sizeof(tHtmlForm), mapOther);
1062 }
1063 f = &(current_cantent->form[num]); f->action_uri = action; f->flags = flags;
1064 f->first_ae = f->last_ae = INVALID_AE; current_cantent->hfnum = num + 1;
1065 #if CONFIG_DEBUG
1066 sprint_safe(debugstrbuf,
1067 "create_html_form(): num=%d, max=%d, action=*%s*, flags=%d\n",
1068 num, max, action, flags);
1069 debugmsg(debugstrbuf);
1070 #endif
1071 }
1072
append_attribute_name(const tAttributeName name)1073 static void append_attribute_name(const tAttributeName name)
1074 /* appends an attribute of the given <name> to current_attributes (avoiding
1075 duplicates) */
1076 { tAttribute* a = fada(name);
1077 if (a != NULL) dealloc(a->value); /* "forget" old value */
1078 else a = create_attribute(name);
1079 a->next = current_attributes;
1080 current_attributes = a;
1081 }
1082
set_current_node(tHtmlNode * node)1083 static __my_inline void set_current_node(tHtmlNode* node)
1084 { current_node = node;
1085 is_current_node_valid = truE;
1086 }
1087
static void store_html_node(tHtmlNode* node, tBoolean do_skip_char)
/* stores the <node> in the tree, updates the lhpp_.... information and creates
   an active-element base if appropriate; while inside a <select>, the <node>
   is deallocated instead of being stored. <do_skip_char> adds one byte to the
   recorded lhpp_byte offset. Finally the <node> becomes the current node. */
{ const tHtmlTagKind htk = node->kind;
  if (inside_select) /* don't store anything */
  { deallocate_html_node(node); return; }

  /* Link the node to the end of the tree: */
  if (previous_node_in_tree != NULL)
  { previous_node_in_tree->next = node; previous_node_in_tree = node; }
  else
  { /* The <node> is the first one for the tree of the current resource: */
    current_cantent->tree = previous_node_in_tree = node;
  }
  node->flags |= hnfStoredInTree;

  /* Record how far the content has been turned into stored nodes; these
     values are written back to the cantent by parser_html_finish() and picked
     up again by parser_html_start(). */
  lhpp_content = current_block;
  if (current_block == NULL)
  { /* This can e.g. happen if an HTML document ends with an opening <title>
       tag (incomplete document or just not yet completely received); in this
       case we have the call chain "parser_html_next() -> change_state(hpsDone)
       -> finish_delayed_node() -> store_html_node()". */
    lhpp_byte = 0;
  }
  else
  { /* parser_html_next() decrements current_block_sizeleft only after its
       state switch, so the character which triggered this call isn't counted
       yet; <do_skip_char> says whether to count it as consumed. */
    lhpp_byte = current_block->used - current_block_sizeleft +
      boolean2bool(do_skip_char);
  }

  if (htk == htkTitle)
  { /* A non-empty soaked-up title text becomes the document title; the string
       is moved (not copied) out of the attribute. */
    tAttribute* t = _find_and_detach_attribute(&(node->data), anInternalText);
    const char* tv;
    if ( (t != NULL) && ( (tv = t->value) != NULL ) && (*tv != '\0') )
    { __dealloc(current_cantent->major_html_title);
      current_cantent->major_html_title = tv; t->value = NULL;
    }
    deattr(t);
  }

  if (node->flags & hnfHasAeBase)
  { /* The node has an active element; append curraebase to the array. */
    if (aenum >= aemax) /* need to allocate more memory */
    { aemax += aenum_incvalue(aemax);
      current_cantent->aebase = aebase = memory_reallocate(aebase, aemax *
        sizeof(tActiveElementBase), mapOther);
    }
#if CONFIG_JAVASCRIPT
    { /* extract event handlers */
      /* NOTE(review): the attribute <a> returned by fada_js() is never
         released in this loop. If fada_js() detaches it from node->data (as
         the loop termination suggests), the record and its value look leaked -
         confirm against fada_js() and the ownership rules of
         javascript_compile() before adding a deattr(). */
      const tJavascriptEventHandler* javascript_ehs = NULL;
      const tAttribute* a;
      while ( (a = fada_js(&(node->data))) != NULL )
      { const char* v = a->value;
        const tJavascriptCode* code;
        if ( (v != NULL) && (*v != '\0') &&
             ( (code = javascript_compile(v)) != NULL ) )
        { /* prepend the new handler to the list */
          tJavascriptEventHandler* eh =
            javascript_create_eh(a->name - anJavascriptBegin, code);
          eh->next = javascript_ehs; javascript_ehs = eh;
        }
      }
      curraebase.eh = javascript_ehs;
    }
#endif
    if (htk == htkTextarea)
    { /* a non-empty soaked-up text is handed to the element via moavr() */
      tAttribute* t = _find_and_detach_attribute(&(node->data),anInternalText);
      if ( (t != NULL) && (t->value != NULL) && (t->value[0] != '\0') )
        moavr(t);
      deattr(t);
    }
    aebase[aenum++] = curraebase;
  }
  set_current_node(node);
#if CONFIG_EXDEBUG
  sprint_safe(debugstrbuf, "stored node %p, %d, %d\n", node, htk, node->flags);
  prsdbg(debugstrbuf);
#endif
}
1162
prepare_text(tBoolean may_trim)1163 static __sallocator char* __callocator prepare_text(tBoolean may_trim)
1164 { char* text = (char*) __memory_allocate(bufsize, mapString);
1165 interpret_character_entities(text, buf, may_trim);
1166 return(text);
1167 }
1168
finish_delayed_node(tBoolean got_text,tBoolean do_skip_char)1169 static one_caller void finish_delayed_node(tBoolean got_text,
1170 tBoolean do_skip_char)
1171 { tHtmlTagKind htk = delayed_node->kind;
1172 if ( (got_text) && (htk != htkScript) && (htk != htkStyle) )
1173 { /* add the text as an internal attribute */
1174 tAttribute* a = create_attribute(anInternalText);
1175 tBoolean may_trim = cond2boolean((htk == htkOption) || (htk == htkTitle));
1176 a->value = prepare_text(may_trim); a->next = delayed_node->data;
1177 delayed_node->data = a;
1178 }
1179 if ( (inside_select) && (htk == htkOption) )
1180 { /* add this option to the current <select> data */
1181 tAttribute *list = (tAttribute*) delayed_node->data,
1182 *v = __fada(list, anValue), *l = __fada(list, anLabel),
1183 *t = __fada(list, anInternalText),
1184 *se = __fada(list, anSelected), *di = __fada(list, anDisabled);
1185 tHtmlOption *option = __memory_allocate(sizeof(tHtmlOption), mapOther),
1186 *o = (tHtmlOption*) select_aebase.render;
1187 tHtmlOptionFlags hof = hofNone;
1188 char *value, *render;
1189
1190 delayed_node->data = (void*) list; deallocate_html_node(delayed_node);
1191
1192 /* What to render? */
1193 if ( (l != NULL) && (l->value != NULL) && (l->value[0] != '\0') )
1194 { render = l->value; l->value = NULL; }
1195 else if ( (t != NULL) && (t->value != NULL) && (t->value[0] != '\0') )
1196 { render = my_strdup(t->value); }
1197 else render = my_strdup(_("[an option]"));
1198
1199 /* What to submit? */
1200 if ( (v != NULL) && (v->value != NULL) && (v->value[0] != '\0') )
1201 { value = v->value; v->value = NULL; }
1202 else if ( (t != NULL) && (t->value != NULL) && (t->value[0] != '\0') )
1203 { value = t->value; t->value = NULL; }
1204 else value = NULL;
1205
1206 if (se != NULL) hof |= hofSelected;
1207 if (di != NULL) hof |= hofDisabled;
1208 option->next = NULL; option->value = value; option->render = render;
1209 option->flags = hof;
1210 if (o == NULL) select_aebase.render = (char*) option;
1211 else
1212 { while (o->next != NULL) o = o->next; /* IMPROVEME? */
1213 o->next = option;
1214 }
1215 select_aebase.maxlength++; /* option counter */
1216 deattr(v); deattr(l); deattr(t); deattr(se); deattr(di);
1217 }
1218 else if (!is_parsing_done) store_html_node(delayed_node, do_skip_char);
1219 else deallocate_html_node(delayed_node); /* just forget it */
1220 delayed_node = NULL; /* done with this one */
1221 }
1222
deallocate_current_attributes(void)1223 static my_inline void deallocate_current_attributes(void)
1224 { if (current_attributes != NULL)
1225 { deallocate_attributes(current_attributes); current_attributes = NULL; }
1226 }
1227
handle_meta_tag(void)1228 static one_caller void handle_meta_tag(void)
1229 { const tAttribute *n1 = fada(anName), *n2 = fada(anHttpEquiv), *n,
1230 *c = fada(anContent);
1231 const char* cv;
1232 if ( (c == NULL) || ( (cv = c->value) == NULL ) ) goto out; /* no content */
1233 n = ( (n1 != NULL) ? n1 : n2 );
1234 if ( (n == NULL) || (n->value == NULL) ) goto out; /* no name */
1235 if (!streqcase(n->value, "refresh")) goto out; /* unknown name */
1236
1237 if (current_cantent->redirection != NULL)
1238 { /* don't override HTTP redirection - the dynamic server usually knows
1239 better than the static HTML document does */
1240 goto out;
1241 }
1242 /* The refresh content isn't "really" standardized, so let's be careful: */
1243 while (IS_WHITESPACE(*cv)) cv++;
1244 if (my_isdigit(*cv))
1245 { /* We skip and ignore the number of seconds. It is often 0 anyway, and if
1246 it _isn't_ 0 it often _should_ be 0 in order not to waste users' time,
1247 and users can get back to the original document easily with a single
1248 keyboard command at any later time. So we redirect immediately... */
1249 int dummy;
1250 my_atoi(cv, &dummy, &cv, 9);
1251 while (IS_WHITESPACE(*cv)) cv++;
1252 }
1253 if ( (*cv == ',') || (*cv == ';') ) cv++;
1254 while (IS_WHITESPACE(*cv)) cv++;
1255 if (strneqcase(cv, "url=", 4)) { cv += 4; while (IS_WHITESPACE(*cv)) cv++; }
1256 if (*cv != '\0')
1257 { current_cantent->redirection = my_strdup(cv);
1258 current_cantent->caf |= cafHtmlRedirection;
1259 #if CONFIG_DEBUG
1260 { char* spfbuf;
1261 my_spf(debugstrbuf, STRBUF_SIZE, &spfbuf, "HTML redirection: *%s*\n",cv);
1262 debugmsg(spfbuf); my_spf_cleanup(debugstrbuf, spfbuf);
1263 }
1264 #endif
1265 }
1266 out:
1267 deattr(n1); deattr(n2); deattr(c);
1268 }
1269
build_html_node(void)1270 static void build_html_node(void)
1271 { tHtmlNode* node = memory_allocate(sizeof(tHtmlNode), mapHtmlNode);
1272 const tBoolean is_textual = cond2boolean(current_tagkind == htkText),
1273 ucb = ( (is_endtag || is_textual || is_parsing_done) ? falsE
1274 : use_curraebase() );
1275 tHtmlNodeFlags hnf = hnfNone;
1276 if (is_endtag) hnf |= hnfIsEndtag;
1277 if (tagblock_ends) hnf |= hnfTagblockEnds;
1278
1279 /* Build the node: */
1280 if (is_textual) node->data = prepare_text(falsE);
1281 else
1282 { if (current_tagkind == htkInvalid)
1283 { append_attribute_name(anInternalTagname);
1284 current_attributes->value = current_unknown_tagname;
1285 /* CHECKME: that's unclean! */
1286 current_unknown_tagname = NULL;
1287 }
1288 else if ( (!is_endtag) && (!is_parsing_done) )
1289 { if (current_tagkind == htkCenter) hnf |= hnfAlignCenter;
1290 else if (current_tagkind == htkMeta) handle_meta_tag();
1291 else if (current_tagkind == htkForm)
1292 { tAttribute *a = fada(anAction), *m = fada(anMethod),
1293 *e = fada(anEnctype);
1294 if ( (a != NULL) && (a->value != NULL) && (a->value[0] != '\0') )
1295 { /* seems we have an "action" attribute, let's create a form */
1296 tHtmlFormFlags ff = hffNone;
1297 hnf |= hnfGoodForm;
1298 if ((m != NULL) && (m->value != NULL) && streqcase(m->value, "post"))
1299 ff |= hffMethodPost;
1300 if ( (e != NULL) && (e->value != NULL) &&
1301 (streqcase(e->value, "multipart/form-data")) )
1302 ff |= hffEncodingMultipart;
1303 create_html_form(a->value, ff);
1304 a->value = NULL;
1305 }
1306 deattr(a); deattr(m); deattr(e);
1307 }
1308 }
1309
1310 /* convert some general attributes to a more efficient representation */
1311 { const tAttribute* a = fada(anAlign);
1312 const char* av;
1313 if ( (a == NULL) /* most likely */ || (current_tagkind == htkTable) ||
1314 (current_tagkind == htkTh) || (current_tagkind == htkTd) )
1315 goto dont_align; /* CHECKME! */
1316 if ( ( (av = a->value) != NULL ) && (*av != '\0') )
1317 { if (streqcase(av, "left")) hnf |= hnfAlignLeft;
1318 else if (streqcase(av, strCenter)) hnf |= hnfAlignCenter;
1319 else if (streqcase(av, "right")) hnf |= hnfAlignRight;
1320 }
1321 dont_align: {}
1322 deattr(a);
1323 }
1324 node->data = current_attributes; current_attributes = NULL; /* detach */
1325 }
1326 node->kind = current_tagkind; node->flags = hnf; node->next = NULL;
1327
1328 /* Now decide what to do with the node: */
1329 if ( (!is_endtag) && (htk_soaks_up_text(current_tagkind)) )
1330 { if (current_tagkind == htkTextarea) node->flags |= hnfHasAeBase;
1331 delayed_node = node; /* We wanna soak up any immediately following text. */
1332 }
1333 else if (current_tagkind == htkSelect)
1334 { if (!is_endtag) select_node = node;
1335 else if (inside_select) /* finish and store the <select> info */
1336 { tHtmlOption *o, *o0;
1337 tBoolean found_selected_option = falsE,
1338 is_multiple = cond2boolean(select_aebase.flags & aefMultiple);
1339 inside_select = falsE; curraebase = select_aebase;
1340 select_node->flags |= hnfHasAeBase;
1341 /* Make sure one option is selected (and _only_ one if non-multiple) */
1342 o = o0 = (tHtmlOption*) curraebase.render;
1343 while (o != NULL)
1344 { if (o->flags & hofSelected)
1345 { if (found_selected_option)
1346 { /* more than one option selected in a non-multiple <select> */
1347 o->flags &= ~hofSelected;
1348 }
1349 else
1350 { found_selected_option = truE;
1351 if (is_multiple) break; /* don't care about further selections */
1352 }
1353 }
1354 o = o->next;
1355 }
1356 if ( (!found_selected_option) && (o0 != NULL) ) o0->flags |= hofSelected;
1357 /* Store the <select> node */
1358 store_html_node(select_node, truE); select_node = NULL;
1359 }
1360 }
1361 else if (!is_parsing_done) /* the most likely case */
1362 { tBoolean do_skip_char = cond2boolean(!is_textual);
1363 if (ucb) node->flags |= hnfHasAeBase;
1364 store_html_node(node, do_skip_char);
1365 }
1366 else if (is_textual)
1367 { /* Just imagine the HTML document currently ends with a longish text - the
1368 user should see it while receiving. Or imagine that a web page author
1369 simply forgot something like "</body></html>" after the last text run in
1370 a document... */
1371 set_current_node(node);
1372 }
1373 else deallocate_html_node(node); /* just forget it */
1374 deallocate_current_attributes();
1375 }
1376
static one_caller tMbsIndex do_lookup_attrname(void)
/* looks up the string in the parser buffer in attrdata[] by means of the
   my_binary_search() macro, comparing with strcmp(), and returns the index of
   the matching entry; lookup_attrname() treats a negative result as "not
   found" */
{ my_binary_search(0, ARRAY_ELEMNUM(attrdata) - 1,
    strcmp(buf, attrdata[idx].str), return(idx))
}
1381
lookup_attrname(void)1382 static one_caller tAttributeName lookup_attrname(void)
1383 { tMbsIndex idx = do_lookup_attrname();
1384 return( (idx < 0) ? anUnknown : attrdata[idx].an );
1385 }
1386
/* helper for change_state(): sets current_attr_name to the local <an> if the
   condition holds; expands to an "if" without trailing semicolon, so an
   "else" may follow it */
#define aai(cond) if (cond) current_attr_name = an /* "accept <an> if" */

static void change_state(tHtmlParserState new_state)
/* leaves the current parser state and enters <new_state>: first the buffer is
   terminated and whatever was collected in the old state (text run, tag name,
   attribute name or attribute value) is processed; then the state is switched
   and the buffer is emptied. On entering hpsText (i.e. at the end of a tag), a
   node is built for the tag if appropriate. Recognizing the comment tag
   overrides <new_state> with hpsComment1. */
{ tAttributeName an;
  buf_append('\0'); /* (for simplicity) */
  if (new_state == hpsDone) is_parsing_done = truE;

  /* Process what was collected in the state we're about to leave; "bufsize >
     1" means that there's something besides the terminator just appended. */
  switch (state)
  {case hpsText: /* reached the end of a text run */
    if (delayed_node != NULL)
      finish_delayed_node(cond2boolean(bufsize > 1), falsE);
    else if (bufsize > 1) /* got some text for a "normal" text node */
    { current_tagkind = htkText; build_html_node(); }
    break;
   case hpsTag:
    /* We store only tags we can make sense of. (I.e. we'll store htkInvalid
       tags only if they have some of the "general" attributes.) */
    current_tagkind = htkInvalid; /* default */
    if (bufsize > 1)
    { if (!strcmp(buf, strCommentTag)) new_state = hpsComment1;
      else
      { /* keep a copy of the name only for unknown tags; a copy left over
           from an earlier tag is released in any case */
        current_tagkind = lookup_tagkind(); __dealloc(current_unknown_tagname);
        current_unknown_tagname = ( (current_tagkind != htkInvalid) ? NULL :
          my_strdup(buf) );
      }
    }
    break;
   case hpsAttrName:
    /* We store only attributes we can make sense of. */
    current_attr_name = anUnknown; /* default */
    if (is_endtag) goto an_ignore; /* no attributes allowed */
    an = lookup_attrname();
    if (an == anUnknown) goto an_ignore;

    /* general attributes (allowed for almost all tags) */
    aai( (an == anAlign) || (an == anId) );
#if CONFIG_CSS
    else aai( (an == anClass) || (an == anStyle) );
#endif
#if CONFIG_JAVASCRIPT
    else
    { /* event handler attributes are numbered from anJavascriptBegin on */
      tMbsIndex idx = lookup_javascript_event();
      if (idx >= 0)
        current_attr_name = ((tAttributeName) idx) + anJavascriptBegin;
    }
#endif
    if (current_attr_name != anUnknown) goto an_append;

    /* specific attributes */
    if (current_tagkind == htkInvalid) goto an_ignore;
    switch (current_tagkind)
    {case htkA: aai( (an == anHref) || (an == anName) ); break;
     case htkArea: aai( (an == anHref) || (an == anAlt) ); break;
     case htkButton:
      aai( (an == anType) || (an == anName) || (an == anValue) ||
           (an == anDisabled) );
      break;
     case htkForm:
      aai( (an == anAction) || (an == anMethod) || (an == anEnctype) ); break;
     case htkFrame: case htkIframe:
      aai( (an == anSrc) || (an == anTitle) ); break;
     case htkHr: aai(an == anWidth); break;
     case htkImg: aai( (an == anAlt) || (an == anSrc) ); break;
     case htkInput:
      aai( (an == anType) || (an == anName) || (an == anValue) || (an == anAlt)
        || (an == anSize) || (an == anMaxlength) || (an == anChecked) ||
        (an == anDisabled) || (an == anReadonly) );
      break;
     case htkObject: aai( (an == anType) || (an == anDeclare) ); break;
     case htkOptgroup: aai( (an == anLabel) || (an == anDisabled) ); break;
     case htkOption:
      aai( (an == anValue) || (an == anLabel) || (an == anSelected) ||
           (an == anDisabled) );
      break;
     case htkSelect:
      aai( (an == anName) || (an == anMultiple) || (an == anDisabled) ); break;
     case htkTextarea:
      aai( (an == anName) || (an == anDisabled) || (an == anReadonly) ); break;
     case htkMeta:
      aai( (an == anName) || (an == anHttpEquiv) || (an == anContent) ); break;
     case htkFont:
      aai(an == anColor);
#if TGC_IS_GRAPHICS
      else aai( (an == anSize) || (an == anFace) );
#endif
      break;
#if CONFIG_CSS
     case htkStyle: aai(an == anMedia); break;
#endif
    }

    if (current_attr_name != anUnknown)
    { an_append: append_attribute_name(current_attr_name); }
    an_ignore: {}
    break;
   case hpsAttrValue:
    /* attach the value to the attribute whose name was appended most
       recently (it's the head of the list) */
    if ( (bufsize > 1) && (current_attr_name != anUnknown) &&
         (current_attributes != NULL) &&
         (current_attributes->name == current_attr_name) )
    { /* (The two latter tests "should" be unnecessary.) */
      current_attributes->value = (shall_interpret_chents_in_attrvalue() ?
        prepare_text(truE) : my_strdup(buf));
    }
    break;
  }

#if CONFIG_DEBUG
  /* log what was collected, except for empty text runs / attribute names */
  if (! ( ( (state == hpsText) || (state == hpsAttrName) ) && (bufsize < 2) ) )
  { char* spfbuf;
    my_spf(debugstrbuf, STRBUF_SIZE, &spfbuf, "%s: *%s%s*\n", hps_name[state],
      ( ( (state == hpsTag) && (is_endtag) ) ? (strSlash) : (strEmpty) ),
      ( (bufsize > 1) ? (buf) : ("(nothing)") ));
    prsdbg(spfbuf); my_spf_cleanup(debugstrbuf, spfbuf);
  }
#endif

  /* Enter the new state with an empty buffer: */
  state = new_state; bufsize = 0;
  if (state == hpsAttrValue) attrvalue_quotes = 0;
  else if (state == hpsText) /* reached the end of an HTML tag */
  { if ( (current_tagkind != htkInvalid) || (current_attributes != NULL) )
    { /* reached the end of 1. a _known_ HTML tag or 2. an htkInvalid tag with
         "general" attributes */
      build_html_node();
    }
  }
}

#undef aai
1515
void parser_html_start(tCantent* cantent)
/* prepares the parser for parsing the <cantent>: the nodes which already
   exist in its tree will be handed out first, and actual parsing continues at
   the position that was recorded in the cantent */
{ size_t parsedsize;

  /* the resource and the tree generated for it so far */
  current_cantent = cantent;
  current_node_in_tree = (tHtmlNode*) cantent->tree;
  previous_node_in_tree = NULL; select_node = NULL; delayed_node = NULL;
  current_node = NULL;

  /* If we haven't yet parsed the whole content, we might reach a point where
     we actually have to parse something, so we setup the parser here: */
  current_block = lhpp_content = cantent->lhpp_content;
  parsedsize = lhpp_byte = cantent->lhpp_byte;
  if (current_block == NULL) /* no block to continue in */
  { dataptr = NULL; current_block_sizeleft = 0; }
  else
  { const size_t usedsize = current_block->used;
    dataptr = current_block->data + parsedsize;
    current_block_sizeleft =
      ( (parsedsize < usedsize) ? (usedsize - parsedsize) : 0 );
  }

  /* the active-element bases collected so far */
  aemax = cantent->aemax; aenum = cantent->aenum; aebase = cantent->aebase;

  /* a fresh state machine with an empty buffer */
  state = hpsText;
  current_unknown_tagname = NULL; buf = NULL;
  bufsize = 0; maxbufsize = 0;
  current_attributes = NULL;
  current_attr_name = anUnknown; current_tagkind = htkInvalid;
  inside_select = falsE; is_parsing_done = falsE;
  is_current_node_valid = falsE;

#if CONFIG_DEBUG
  sprint_safe(debugstrbuf, "\nparser_html_start(): %p,%p,%p,%d,%d\n",
    cantent, current_block, dataptr, parsedsize, current_block_sizeleft);
  prsdbg(debugstrbuf);
#endif
}
1554
const tHtmlNode* parser_html_next(tBoolean inside_pre)
/* returns the next node (or NULL) for the currently parsed resource. Nodes
   which already exist in the tree are handed out first; after that, the
   content is consumed character by character until a node becomes available
   or the content ends. If <inside_pre> is set, whitespace in text is kept
   as-is; otherwise each run of whitespace is replaced with a single space. */
{ tBoolean found_whitespace = falsE; /* pending whitespace run in text? */
  char ch;
  if (current_node_in_tree != NULL)
  { /* need not actually parse something, found a node inside the tree
       (generated during earlier passes) */
    tHtmlNode* retval = previous_node_in_tree = current_node_in_tree;
    current_node_in_tree = current_node_in_tree->next;
    return(retval);
  }

  /* Reached the end of the already generated tree, have to parse: */
  loop:
  if (is_current_node_valid)
  { /* Once parsing is done, the validity flag stays set, so the last node is
       handed out once and every later call results in NULL. */
    if (is_parsing_done)
    { tHtmlNode* retval = current_node; current_node = NULL; return(retval); }
    else { is_current_node_valid = falsE; return(current_node); }
  }
  if (current_block_sizeleft <= 0)
  { /* the current block is exhausted, proceed to the next one */
    if (current_block != NULL) current_block = current_block->next;
    if (current_block != NULL)
    { dataptr = current_block->data;
      current_block_sizeleft = current_block->used;
    }
    else /* reached the end of the last content block in the list */
    { change_state(hpsDone);
      if (!is_current_node_valid)
      { current_node = NULL; is_current_node_valid = truE; } /* (extra care) */
    }
    goto loop;
  }
  ch = *dataptr++;
  switch (state)
  {case hpsText:
    if (ch == '<')
    { if (found_whitespace) { buf_append(' '); found_whitespace = falsE; }
      change_state(hpsTag); is_endtag = falsE;
    }
    else if (!inside_pre)
    { /* whitespace is only noted here and emitted as a single space as soon
         as something else follows */
      if (IS_WHITESPACE(ch)) found_whitespace = truE;
      else
      { if (found_whitespace) { buf_append(' '); found_whitespace = falsE; }
        buf_append(ch);
      }
    }
    else buf_append(ch);
    break;
   case hpsTag:
    if (IS_WHITESPACE(ch))
    { if (bufsize > 0) { change_state(hpsAttrName); tagblock_ends = falsE; } }
    else if (ch == '>') change_state(hpsText);
    else if (ch == '/')
    { /* a slash before the name marks an end tag; after the name, it says
         that the tag block ends (as in "<br/>") */
      if (bufsize == 0) is_endtag = truE;
      else { change_state(hpsAttrName); tagblock_ends = truE; }
    }
    else
    { buf_append(my_tolower(ch)); /* case-insensitivity: htmlspec, 3.2.1 */
      if ( (bufsize == 3) && (!strncmp(buf, strCommentTag, 3)) )
        change_state(hpsComment1);
    }
    break;
   case hpsAttrName:
    if (IS_WHITESPACE(ch)) { if (bufsize > 0) change_state(hpsEquals); }
    else if (ch == '=') change_state(hpsAttrValue);
    else if (ch == '>') change_state(hpsText);
    else if (ch == '/') tagblock_ends = truE; /* CHECKME: chg..(hpsAttrName)?*/
    else
    { attr_name_append:
      buf_append(my_tolower(ch)); /* case-insensitivity: htmlspec, 3.2.2 */
    }
    break;
   case hpsEquals:
    /* got an attribute name followed by whitespace; wait for what's next */
    if (IS_WHITESPACE(ch)) { /* nothing */ }
    else if (ch == '=') change_state(hpsAttrValue);
    else if (ch == '>') change_state(hpsText);
    else { change_state(hpsAttrName); goto attr_name_append; } /* no value */
    break;
   case hpsAttrValue:
    /* attrvalue_quotes: 0 = unquoted (so far), 1 = inside single quotes, 2 =
       inside double quotes */
    if (IS_WHITESPACE(ch))
    { if (attrvalue_quotes != 0) buf_append(ch);
      else if (bufsize > 0) change_state(hpsAttrName);
    }
    else if (ch == '"')
    { if ( (attrvalue_quotes == 0) && (bufsize == 0) ) attrvalue_quotes = 2;
      else if (attrvalue_quotes == 2) change_state(hpsAttrName); /* value end*/
      else buf_append(ch);
    }
    else if (ch == '\'')
    { if ( (attrvalue_quotes == 0) && (bufsize == 0) ) attrvalue_quotes = 1;
      else if (attrvalue_quotes == 1) change_state(hpsAttrName); /* value end*/
      else buf_append(ch);
    }
    else if ( (ch == '>') && (attrvalue_quotes == 0) ) change_state(hpsText);
    else buf_append(ch);
    break;
   /* For most of the hpsComment states, we need not call change_state()
      because these are rather some kind of "sub-states": */
   case hpsComment1: /* inside the comment, no "-" seen recently */
    if (ch == '-') state = hpsComment2;
    break;
   case hpsComment2: /* seen one "-" */
    if (ch == '-') state = hpsComment3;
    else state = hpsComment1;
    break;
   case hpsComment3: /* seen "--", waiting for the closing ">" */
    if (ch == '>') change_state(hpsText);
    else if ( (!IS_WHITESPACE(ch)) && (ch != '-') ) state = hpsComment1;
    /* "else": stick to hpsComment3! htmlspec, 3.2.4: "White space is not per-
       mitted between the markup declaration open delimiter("<!") and the com-
       ment open delimiter ("--"), but is permitted between the comment close
       delimiter ("--") and the markup declaration close delimiter (">")."
       Additionally, we leniently allow extra "-" characters because a web page
       author might accidentally write e.g. "--->" instead of "-->"... */
    break;
  }
  /* Only now the character counts as consumed; code which is reached from the
     above switch still sees the old value. */
  current_block_sizeleft--;
  goto loop;
}
1674
parser_html_finish(void)1675 void parser_html_finish(void)
1676 /* finishes the parsing of the current cantent */
1677 { if (inside_select)
1678 { deallocate_one_aebase(&select_aebase);
1679 if (select_node != NULL) deallocate_html_node(select_node);
1680 }
1681 if (delayed_node != NULL) deallocate_html_node(delayed_node);
1682 deallocate_attributes(current_attributes);
1683 current_cantent->lhpp_content = lhpp_content;
1684 current_cantent->lhpp_byte = lhpp_byte;
1685 current_cantent->aebase = aebase;
1686 current_cantent->aenum = aenum; current_cantent->aemax = aemax;
1687 __dealloc(buf);
1688 i18n_cleanup /* FIXME on interface change: only do this if parser usedepth is
1689 zero! */
1690 }
1691
parser_initialize(void)1692 one_caller void __init parser_initialize(void)
1693 {
1694 #if CONFIG_DEBUG
1695 static const char headline[] = "retawq " RETAWQ_VERSION
1696 " HTML parser debugging file (<http://retawq.sourceforge.net/>)\n";
1697 fd_parsertest = my_create("htmldebug.txt", O_CREAT | O_TRUNC | O_WRONLY,
1698 S_IRUSR | S_IWUSR);
1699 if (fd_parsertest < 0)
1700 fatal_error(errno, "can't create HTML parser debugging file");
1701 make_fd_cloexec(fd_parsertest);
1702 prsdbg(headline);
1703 #endif
1704 }
1705