1 /*
2  * File: html.cc
3  *
4  * Copyright (C) 2005-2007 Jorge Arellano Cid <jcid@dillo.org>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 3 of the License, or
9  * (at your option) any later version.
10  */
11 
12 /*
13  * Dillo HTML parsing routines
14  */
15 
16 /*-----------------------------------------------------------------------------
17  * Includes
18  *---------------------------------------------------------------------------*/
19 #include <ctype.h>      /* for isspace */
20 #include <string.h>     /* for memcpy and memmove */
21 #include <stdlib.h>
22 #include <stdio.h>      /* for sprintf */
23 #include <errno.h>
24 
25 #include "bw.h"         /* for BrowserWindow */
26 #include "msg.h"
27 #include "binaryconst.h"
28 #include "colors.h"
29 #include "utf8.hh"
30 
31 #include "misc.h"
32 #include "uicmd.hh"
33 #include "history.h"
34 #include "menu.hh"
35 #include "prefs.h"
36 #include "capi.h"
37 #include "html.hh"
38 #include "html_common.hh"
39 #include "form.hh"
40 #include "table.hh"
41 
42 #include "dw/textblock.hh"
43 #include "dw/bullet.hh"
44 #include "dw/listitem.hh"
45 #include "dw/image.hh"
46 #include "dw/ruler.hh"
47 
48 /*-----------------------------------------------------------------------------
49  * Defines
50  *---------------------------------------------------------------------------*/
51 
52 /* Define to 1 to ignore white space immediately after an open tag,
53  * and immediately before a close tag. */
54 #define SGML_SPCDEL 0
55 
56 #define TAB_SIZE 8
57 
58 /*-----------------------------------------------------------------------------
59  * Name spaces
60  *---------------------------------------------------------------------------*/
61 using namespace lout;
62 using namespace dw;
63 using namespace dw::core;
64 using namespace dw::core::ui;
65 using namespace dw::core::style;
66 
67 /*-----------------------------------------------------------------------------
68  * Typedefs
69  *---------------------------------------------------------------------------*/
/* The parser class is only forward-declared here so that the handler
 * signatures below can refer to it. */
class DilloHtml;
/* Handler signature used for both the "open" and the "content" slot of a
 * TagInfo entry. (tag, tagsize) presumably describe the raw tag text and
 * its length -- confirm against the callers. */
typedef void (*TagOpenFunct) (DilloHtml *html, const char *tag, int tagsize);
/* Handler signature used for the "close" slot of a TagInfo entry. */
typedef void (*TagCloseFunct) (DilloHtml *html);
73 
/*
 * States of a tag-attribute scanner.
 * NOTE(review): the state machine that uses these values is not in this
 * part of the file; the meaning of each state is suggested by its name
 * only and should be confirmed against that code.
 */
typedef enum {
   SEEK_ATTR_START,
   MATCH_ATTR_NAME,
   SEEK_TOKEN_START,
   SEEK_VALUE_START,
   SKIP_VALUE,
   GET_VALUE,
   FINISHED
} DilloHtmlTagParsingState;
83 
/*
 * Bit flags; each constant occupies its own bit so that they can be
 * OR-ed together.
 */
typedef enum {
   HTML_LeftTrim      = 0x01,
   HTML_RightTrim     = 0x02,
   HTML_ParseEntities = 0x04
} DilloHtmlTagParsingFlags;
89 
90 
/*
 * Exported function with C linkage.
 * The declaration is wrapped in extern "C" so that the definition of
 * a_Html_text in this file gets an unmangled symbol name.
 */
extern "C" {
void *a_Html_text(const char *type, void *P, CA_Callback_t *Call,void **Data);
}
97 
98 /*-----------------------------------------------------------------------------
99  * Forward declarations
100  *---------------------------------------------------------------------------*/
/* File-local helpers that are referenced before their definitions. */
static int Html_write_raw(DilloHtml *html, char *buf, int bufsize, int Eof);
static bool Html_load_image(BrowserWindow *bw, DilloUrl *url,
                            const DilloUrl *requester, DilloImage *image);
static void Html_callback(int Op, CacheClient_t *Client);
static void Html_tag_cleanup_at_close(DilloHtml *html, int TagIdx);
106 
107 /*-----------------------------------------------------------------------------
108  * Local Data
109  *---------------------------------------------------------------------------*/
/*
 * Parsing table structure: the record type of the entries of Tags[].
 */
typedef struct {
   const char *name;      /* element name */
   unsigned char Flags;   /* flags (explained near the table data) */
   char EndTag;           /* Is it Required, Optional or Forbidden */
   uchar_t TagLevel;      /* Used to heuristically parse bad HTML  */
   TagOpenFunct open;     /* Open function */
   TagOpenFunct content;  /* Content function (same signature as open) */
   TagCloseFunct close;   /* Close function */
} TagInfo;
/* Only declared here; the table data is defined elsewhere. */
extern const TagInfo Tags[];
121 
122 /*-----------------------------------------------------------------------------
123  *-----------------------------------------------------------------------------
124  * Main Code
125  *-----------------------------------------------------------------------------
126  *---------------------------------------------------------------------------*/
127 
128 /*
129  * Collect HTML error strings.
130  */
bugMessage(const char * format,...)131 void DilloHtml::bugMessage(const char *format, ... )
132 {
133    va_list argp;
134 
135    if (bw->num_page_bugs)
136       dStr_append_c(bw->page_bugs, '\n');
137    dStr_sprintfa(bw->page_bugs,
138                  "HTML warning: line %d, ",
139                  getCurrLineNumber());
140    va_start(argp, format);
141    dStr_vsprintfa(bw->page_bugs, format, argp);
142    va_end(argp);
143    a_UIcmd_set_bug_prog(bw, ++bw->num_page_bugs);
144 }
145 
146 /*
147  * Wrapper for a_Url_new that adds an error detection message.
148  * If use_base_url is TRUE, it uses base_url. Otherwise it uses html->base_url.
149  */
a_Html_url_new(DilloHtml * html,const char * url_str,const char * base_url,int use_base_url)150 DilloUrl *a_Html_url_new(DilloHtml *html,
151                          const char *url_str, const char *base_url,
152                          int use_base_url)
153 {
154    DilloUrl *url;
155    int n_ic, n_ic_spc;
156 
157    url = a_Url_new(url_str,
158                    (use_base_url) ? base_url : URL_STR_(html->base_url));
159    if ((n_ic = URL_ILLEGAL_CHARS(url)) != 0) {
160       const char *suffix = (n_ic) > 1 ? "s" : "";
161       n_ic_spc = URL_ILLEGAL_CHARS_SPC(url);
162       if (n_ic == n_ic_spc) {
163          BUG_MSG("URL has %d illegal space%s ('%s').", n_ic, suffix, url_str);
164       } else if (n_ic_spc == 0) {
165          BUG_MSG("URL has %d illegal byte%s in {00-1F, 7F-FF} range ('%s').",
166                  n_ic, suffix, url_str);
167       } else {
168          BUG_MSG("URL has %d illegal byte%s: "
169                  "%d space%s and %d in {00-1F, 7F-FF} range ('%s').",
170                  n_ic, suffix,
171                  n_ic_spc, n_ic_spc > 1 ? "s" : "", n_ic-n_ic_spc, url_str);
172       }
173    }
174    return url;
175 }
176 
177 /*
178  * Set callback function and callback data for the "html/text" MIME type.
179  */
a_Html_text(const char * Type,void * P,CA_Callback_t * Call,void ** Data)180 void *a_Html_text(const char *Type, void *P, CA_Callback_t *Call, void **Data)
181 {
182    DilloWeb *web = (DilloWeb*)P;
183    DilloHtml *html = new DilloHtml(web->bw, web->url, Type);
184 
185    *Data = (void*)html;
186    *Call = (CA_Callback_t)Html_callback;
187 
188    return (void*)html->dw;
189 }
190 
Html_free(void * data)191 static void Html_free(void *data)
192 {
193    delete ((DilloHtml*)data);
194 }
195 
196 /*
197  * Used by the "Load images" page menuitem.
198  */
a_Html_load_images(void * v_html,DilloUrl * pattern)199 void a_Html_load_images(void *v_html, DilloUrl *pattern)
200 {
201    DilloHtml *html = (DilloHtml*)v_html;
202 
203    html->loadImages(pattern);
204 }
205 
206 /*
207  * Search for form
208  */
Html_contains_form(DilloHtml * html,void * v_form)209 static bool Html_contains_form(DilloHtml *html, void *v_form)
210 {
211    for (int i = 0; i < html->forms->size(); i++) {
212       if (html->forms->get(i) == v_form) {
213          return true;
214       }
215    }
216    return false;
217 }
218 
219 /*
220  * Used by the "Submit form" form menuitem.
221  */
a_Html_form_submit(void * v_html,void * v_form)222 void a_Html_form_submit(void *v_html, void *v_form)
223 {
224    DilloHtml *html = (DilloHtml*)v_html;
225 
226    if (Html_contains_form(html, v_form)) {
227       /* it's still valid */
228      a_Html_form_submit2(v_form);
229    }
230 }
231 
232 /*
233  * Used by the "Reset form" form menuitem.
234  */
a_Html_form_reset(void * v_html,void * v_form)235 void a_Html_form_reset(void *v_html, void *v_form)
236 {
237    DilloHtml *html = (DilloHtml*)v_html;
238 
239    if (Html_contains_form(html, v_form)) {
240       /* it's still valid */
241      a_Html_form_reset2(v_form);
242    }
243 }
244 
245 /*
246  * Used by the "Show/Hide hiddens" form menuitem.
247  */
a_Html_form_display_hiddens(void * v_html,void * v_form,bool_t display)248 void a_Html_form_display_hiddens(void *v_html, void *v_form, bool_t display)
249 {
250    DilloHtml *html = (DilloHtml*)v_html;
251 
252    if (Html_contains_form(html, v_form)) {
253       /* it's still valid */
254       a_Html_form_display_hiddens2(v_form, (display != 0));
255    }
256 }
257 
258 /*
259  * Set the URL data for image maps.
260  */
Html_set_link_coordinates(DilloHtml * html,int link,int x,int y)261 static void Html_set_link_coordinates(DilloHtml *html, int link, int x, int y)
262 {
263    char data[64];
264 
265    if (x != -1) {
266       snprintf(data, 64, "?%d,%d", x, y);
267       a_Url_set_ismap_coords(html->links->get(link), data);
268    }
269 }
270 
271 /*
272  * Create a new link, set it as the url's parent
273  * and return the index.
274  */
Html_set_new_link(DilloHtml * html,DilloUrl ** url)275 static int Html_set_new_link(DilloHtml *html, DilloUrl **url)
276 {
277    int nl = html->links->size();
278    html->links->increase();
279    html->links->set(nl, (*url) ? *url : NULL);
280    return nl;
281 }
282 
283 /*
284  * Evaluates the ALIGN attribute (left|center|right|justify) and
285  * sets the style at the top of the stack.
286  */
a_Html_tag_set_align_attr(DilloHtml * html,const char * tag,int tagsize)287 void a_Html_tag_set_align_attr(DilloHtml *html, const char *tag, int tagsize)
288 {
289    const char *align;
290 
291    if ((align = a_Html_get_attr(html, tag, tagsize, "align"))) {
292       TextAlignType textAlignType = TEXT_ALIGN_LEFT;
293 
294       if (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f)
295          BUG_MSG("The align attribute is obsolete in HTML5.");
296 
297       if (dStrAsciiCasecmp (align, "left") == 0)
298          textAlignType = TEXT_ALIGN_LEFT;
299       else if (dStrAsciiCasecmp (align, "right") == 0)
300          textAlignType = TEXT_ALIGN_RIGHT;
301       else if (dStrAsciiCasecmp (align, "center") == 0)
302          textAlignType = TEXT_ALIGN_CENTER;
303       else if (dStrAsciiCasecmp (align, "justify") == 0)
304          textAlignType = TEXT_ALIGN_JUSTIFY;
305 #if 0
306       else if (dStrAsciiCasecmp (align, "char") == 0) {
307          /* TODO: Actually not supported for <p> etc. */
308          v.textAlign = TEXT_ALIGN_STRING;
309          if ((charattr = a_Html_get_attr(html, tag, tagsize, "char"))) {
310             if (charattr[0] == 0)
311                /* TODO: ALIGN=" ", and even ALIGN="&32;" will reult in
312                 * an empty string (don't know whether the latter is
313                 * correct, has to be clarified with the specs), so
314                 * that for empty strings, " " is assumed. */
315                style_attrs.textAlignChar = ' ';
316             else
317                style_attrs.textAlignChar = charattr[0];
318          } else
319             /* TODO: Examine LANG attr of <html>. */
320             style_attrs.textAlignChar = '.';
321       }
322 #endif
323       html->styleEngine->setNonCssHint(CSS_PROPERTY_TEXT_ALIGN, CSS_TYPE_ENUM,
324                                        textAlignType);
325    }
326 }
327 
328 /*
329  * Evaluates the VALIGN attribute (top|bottom|middle|baseline) and
330  * sets the style in style_attrs. Returns true when set.
331  */
a_Html_tag_set_valign_attr(DilloHtml * html,const char * tag,int tagsize)332 bool a_Html_tag_set_valign_attr(DilloHtml *html, const char *tag, int tagsize)
333 {
334    const char *attr;
335    VAlignType valign;
336 
337    if ((attr = a_Html_get_attr(html, tag, tagsize, "valign"))) {
338       if (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f)
339          BUG_MSG("The valign attribute is obsolete in HTML5.");
340 
341       if (dStrAsciiCasecmp (attr, "top") == 0)
342          valign = VALIGN_TOP;
343       else if (dStrAsciiCasecmp (attr, "bottom") == 0)
344          valign = VALIGN_BOTTOM;
345       else if (dStrAsciiCasecmp (attr, "baseline") == 0)
346          valign = VALIGN_BASELINE;
347       else
348          valign = VALIGN_MIDDLE;
349 
350       html->styleEngine->setNonCssHint (CSS_PROPERTY_VERTICAL_ALIGN,
351                                         CSS_TYPE_ENUM, valign);
352       return true;
353    } else
354       return false;
355 }
356 
357 
358 /*
359  * Create and add a new Textblock to the current Textblock
360  */
Html_add_textblock(DilloHtml * html,int space)361 static void Html_add_textblock(DilloHtml *html, int space)
362 {
363    Textblock *textblock = new Textblock (prefs.limit_text_width);
364 
365    HT2TB(html)->addParbreak (space, html->wordStyle ());
366    HT2TB(html)->addWidget (textblock, html->style ());
367    HT2TB(html)->addParbreak (space, html->wordStyle ());
368    S_TOP(html)->textblock = html->dw = textblock;
369    S_TOP(html)->hand_over_break = true;
370 }
371 
372 /*
373  * Create and initialize a new DilloHtml class
374  */
DilloHtml(BrowserWindow * p_bw,const DilloUrl * url,const char * content_type)375 DilloHtml::DilloHtml(BrowserWindow *p_bw, const DilloUrl *url,
376                      const char *content_type)
377 {
378    /* Init main variables */
379    bw = p_bw;
380    page_url = a_Url_dup(url);
381    base_url = a_Url_dup(url);
382    dw = NULL;
383 
384    /* Init event receiver */
385    linkReceiver.html = this;
386    HT2LT(this)->connectLink (&linkReceiver);
387 
388    a_Bw_add_doc(p_bw, this);
389 
390    /* Init for-parsing variables */
391    Start_Buf = NULL;
392    Start_Ofs = 0;
393 
394    _MSG("DilloHtml(): content type: %s\n", content_type);
395    this->content_type = dStrdup(content_type);
396 
397    /* get charset */
398    a_Misc_parse_content_type(content_type, NULL, NULL, &charset);
399 
400    stop_parser = false;
401 
402    CurrOfs = OldOfs = 0;
403    OldLine = 1;
404 
405    DocType = DT_NONE;    /* assume Tag Soup 0.0!   :-) */
406    DocTypeVersion = 0.0f;
407 
408    styleEngine = new StyleEngine (HT2LT (this), page_url, base_url);
409 
410    cssUrls = new misc::SimpleVector <DilloUrl*> (1);
411 
412    stack = new misc::SimpleVector <DilloHtmlState> (16);
413    stack->increase();
414    stack->getRef(0)->parse_mode = DILLO_HTML_PARSE_MODE_INIT;
415    stack->getRef(0)->table_mode = DILLO_HTML_TABLE_MODE_NONE;
416    stack->getRef(0)->table_border_mode = DILLO_HTML_TABLE_BORDER_SEPARATE;
417    stack->getRef(0)->cell_text_align_set = false;
418    stack->getRef(0)->display_none = false;
419    stack->getRef(0)->list_type = HTML_LIST_NONE;
420    stack->getRef(0)->list_number = 0;
421    stack->getRef(0)->tag_idx = -1;               /* MUST not be used */
422    stack->getRef(0)->textblock = NULL;
423    stack->getRef(0)->table = NULL;
424    stack->getRef(0)->ref_list_item = NULL;
425    stack->getRef(0)->hand_over_break = false;
426 
427    InFlags = IN_NONE;
428 
429    Stash = dStr_new("");
430    StashSpace = false;
431 
432    pre_column = 0;
433    PreFirstChar = false;
434    PrevWasCR = false;
435    InVisitedLink = false;
436    ReqTagClose = false;
437    TagSoup = true;
438    loadCssFromStash = false;
439 
440    Num_HTML = Num_HEAD = Num_BODY = Num_TITLE = 0;
441 
442    attr_data = dStr_sized_new(1024);
443 
444    non_css_link_color = -1;
445    non_css_visited_color = -1;
446    visited_color = -1;
447 
448    /* Init page-handling variables */
449    forms = new misc::SimpleVector <DilloHtmlForm*> (1);
450    inputs_outside_form = new misc::SimpleVector <DilloHtmlInput*> (1);
451    links = new misc::SimpleVector <DilloUrl*> (64);
452    images = new misc::SimpleVector <DilloHtmlImage*> (16);
453 
454    /* Initialize the main widget */
455    initDw();
456    /* Hook destructor to the dw delete call */
457    dw->setDeleteCallback(Html_free, this);
458 }
459 
460 /*
461  * Miscellaneous initializations for Dw
462  */
initDw()463 void DilloHtml::initDw()
464 {
465    dReturn_if_fail (dw == NULL);
466 
467    /* Create the main widget */
468    dw = stack->getRef(0)->textblock =  new Textblock (prefs.limit_text_width);
469 
470    bw->num_page_bugs = 0;
471    dStr_truncate(bw->page_bugs, 0);
472 }
473 
474 /*
475  * Free memory used by the DilloHtml class.
476  */
~DilloHtml()477 DilloHtml::~DilloHtml()
478 {
479    _MSG("::~DilloHtml(this=%p)\n", this);
480 
481    freeParseData();
482 
483    a_Bw_remove_doc(bw, this);
484 
485    a_Url_free(page_url);
486    a_Url_free(base_url);
487 
488    for (int i = 0; i < cssUrls->size(); i++)
489       a_Url_free(cssUrls->get(i));
490    delete (cssUrls);
491 
492    for (int i = 0; i < forms->size(); i++)
493       a_Html_form_delete (forms->get(i));
494    delete(forms);
495 
496    for (int i = 0; i < inputs_outside_form->size(); i++)
497       a_Html_input_delete(inputs_outside_form->get(i));
498    delete(inputs_outside_form);
499 
500    for (int i = 0; i < links->size(); i++)
501       a_Url_free(links->get(i));
502    delete (links);
503 
504    for (int i = 0; i < images->size(); i++) {
505       DilloHtmlImage *img = images->get(i);
506       a_Url_free(img->url);
507       a_Image_unref(img->image);
508       dFree(img);
509    }
510    delete (images);
511 
512    delete styleEngine;
513 }
514 
515 /*
516  * Process the newly arrived html and put it into the page structure.
517  * (This function is called by Html_callback whenever there's new data)
518  */
write(char * Buf,int BufSize,int Eof)519 void DilloHtml::write(char *Buf, int BufSize, int Eof)
520 {
521    int token_start;
522    char *buf = Buf + Start_Ofs;
523    int bufsize = BufSize - Start_Ofs;
524 
525    _MSG("DilloHtml::write BufSize=%d Start_Ofs=%d\n", BufSize, Start_Ofs);
526 #if 0
527    char *aux = dStrndup(Buf, BufSize);
528    MSG(" {%s}\n", aux);
529    dFree(aux);
530 #endif
531 
532    /* Update Start_Buf. It may be used after the parser is stopped */
533    Start_Buf = Buf;
534 
535    dReturn_if (dw == NULL);
536    dReturn_if (stop_parser == true);
537 
538    token_start = Html_write_raw(this, buf, bufsize, Eof);
539    Start_Ofs += token_start;
540 }
541 
542 /*
543  * Return the line number of the tag/word being processed by the parser.
544  * Also update the offsets.
545  */
getCurrLineNumber()546 int DilloHtml::getCurrLineNumber()
547 {
548    int i, ofs, line;
549    const char *p = Start_Buf;
550 
551    dReturn_val_if_fail(p != NULL, -1);
552    /* Disable line counting for META hack. Buffers differ. */
553    dReturn_val_if((InFlags & IN_META_HACK), -1);
554 
555    ofs = CurrOfs;
556    line = OldLine;
557    for (i = OldOfs; i < ofs; ++i)
558       if (p[i] == '\n' || (p[i] == '\r' && p[i+1] != '\n'))
559          ++line;
560    OldOfs = CurrOfs;
561    OldLine = line;
562    return line;
563 }
564 
565 /*
566  * Free parsing data.
567  */
freeParseData()568 void DilloHtml::freeParseData()
569 {
570    delete(stack);
571 
572    dStr_free(Stash, TRUE);
573    dStr_free(attr_data, TRUE);
574    dFree(content_type);
575    dFree(charset);
576 }
577 
578 /*
579  * Finish parsing a HTML page. Close the parser and close the client.
580  * The class is not deleted here, it remains until the widget is destroyed.
581  */
finishParsing(int ClientKey)582 void DilloHtml::finishParsing(int ClientKey)
583 {
584    int si;
585 
586    dReturn_if (stop_parser == true);
587 
588    /* flag we've already parsed up to the last byte */
589    InFlags |= IN_EOF;
590 
591    /* force the close of elements left open (TODO: not for XHTML) */
592    while ((si = stack->size() - 1)) {
593       if (stack->getRef(si)->tag_idx != -1) {
594          Html_tag_cleanup_at_close(this, stack->getRef(si)->tag_idx);
595       }
596    }
597 
598    /* Nothing left to do with the parser. Clear all flags, except EOF. */
599    InFlags = IN_EOF;
600 
601    /* Remove this client from our active list */
602    a_Bw_close_client(bw, ClientKey);
603 }
604 
605 /*
606  * Allocate and insert form information.
607  */
formNew(DilloHtmlMethod method,const DilloUrl * action,DilloHtmlEnc enc,const char * charset)608 int DilloHtml::formNew(DilloHtmlMethod method, const DilloUrl *action,
609                        DilloHtmlEnc enc, const char *charset)
610 {
611    // avoid data loss on repush after CSS stylesheets have been loaded
612    bool enabled = bw->NumPendingStyleSheets == 0;
613    DilloHtmlForm *form = a_Html_form_new (this, method, action,
614                                           enc, charset, enabled);
615    int nf = forms->size ();
616    forms->increase ();
617    forms->set (nf, form);
618    _MSG("Html formNew: action=%s nform=%d\n", action, nf);
619    return forms->size();
620 }
621 
622 /*
623  * Get the current form.
624  */
getCurrentForm()625 DilloHtmlForm *DilloHtml::getCurrentForm ()
626 {
627    return forms->get (forms->size() - 1);
628 }
629 
unloadedImages()630 bool_t DilloHtml::unloadedImages()
631 {
632    for (int i = 0; i < images->size(); i++) {
633       if (images->get(i)->image != NULL) {
634          return TRUE;
635       }
636    }
637    return FALSE;
638 }
639 
640 /*
641  * Load images if they were disabled.
642  */
loadImages(const DilloUrl * pattern)643 void DilloHtml::loadImages (const DilloUrl *pattern)
644 {
645    dReturn_if (a_Bw_expecting(bw));
646 
647    /* If the user asked for a specific image, the user (NULL) is the requester,
648     * and the domain mechanism will always permit the request. But if the user
649     * just asked for all images (clicking "Load images"), use the page URL as
650     * the requester so that the domain mechanism can act as a filter.
651     * If the possible patterns become more complex, it might be good to have
652     * the caller supply the requester instead.
653     */
654    const DilloUrl *requester = pattern ? NULL : this->page_url;
655 
656    for (int i = 0; i < images->size(); i++) {
657       DilloHtmlImage *hi = images->get(i);
658 
659       if (hi->image) {
660          assert(hi->url);
661          if ((!pattern) || (!a_Url_cmp(hi->url, pattern))) {
662             if (Html_load_image(bw, hi->url, requester, hi->image)) {
663                a_Image_unref (hi->image);
664                hi->image = NULL;  // web owns it now
665             }
666          }
667       }
668    }
669 }
670 
671 /*
672  * Save URL in a vector (may be loaded later).
673  */
addCssUrl(const DilloUrl * url)674 void DilloHtml::addCssUrl(const DilloUrl *url)
675 {
676    int nu = cssUrls->size();
677    cssUrls->increase();
678    cssUrls->set(nu, a_Url_dup(url));
679 }
680 
enter(Widget * widget,int link,int img,int x,int y)681 bool DilloHtml::HtmlLinkReceiver::enter (Widget *widget, int link, int img,
682                                          int x, int y)
683 {
684    BrowserWindow *bw = html->bw;
685 
686    _MSG(" ** ");
687    if (link == -1) {
688       _MSG(" Link  LEAVE  notify...\n");
689       a_UIcmd_set_msg(bw, "");
690    } else {
691       _MSG(" Link  ENTER  notify...\n");
692       Html_set_link_coordinates(html, link, x, y);
693       a_UIcmd_set_msg(bw, "%s", URL_STR(html->links->get(link)));
694    }
695    return true;
696 }
697 
698 /*
699  * Handle the "press" signal.
700  */
press(Widget * widget,int link,int img,int x,int y,EventButton * event)701 bool DilloHtml::HtmlLinkReceiver::press (Widget *widget, int link, int img,
702                                          int x, int y, EventButton *event)
703 {
704    BrowserWindow *bw = html->bw;
705    int ret = false;
706    DilloUrl *linkurl = NULL;
707 
708    _MSG("pressed button %d\n", event->button);
709    if (event->button == 3) {
710       // popup menus
711       if (img != -1) {
712          // image menu
713          if (link != -1)
714             linkurl = html->links->get(link);
715          const bool_t loaded_img = (html->images->get(img)->image == NULL);
716          a_UIcmd_image_popup(bw, html->images->get(img)->url, loaded_img,
717                              html->page_url, linkurl);
718          ret = true;
719       } else {
720          if (link == -1) {
721             a_UIcmd_page_popup(bw, bw->num_page_bugs != 0, html->cssUrls);
722             ret = true;
723          } else {
724             a_UIcmd_link_popup(bw, html->links->get(link));
725             ret = true;
726          }
727       }
728    }
729    return ret;
730 }
731 
732 /*
733  * Handle the "click" signal.
734  */
click(Widget * widget,int link,int img,int x,int y,EventButton * event)735 bool DilloHtml::HtmlLinkReceiver::click (Widget *widget, int link, int img,
736                                          int x, int y, EventButton *event)
737 {
738    BrowserWindow *bw = html->bw;
739 
740    if ((img != -1) && (html->images->get(img)->image)) {
741       // clicked an image that has not already been loaded
742       if (event->button == 1){
743          // load all instances of this image
744          DilloUrl *pattern = html->images->get(img)->url;
745          html->loadImages(pattern);
746          return true;
747       }
748    }
749 
750    if (link != -1) {
751       DilloUrl *url = html->links->get(link);
752       _MSG("clicked on URL %d: %s\n", link, a_Url_str (url));
753 
754       Html_set_link_coordinates(html, link, x, y);
755 
756       if (event->button == 1) {
757          a_UIcmd_open_url(bw, url);
758       } else if (event->button == 2) {
759          if (prefs.middle_click_opens_new_tab) {
760             int focus = prefs.focus_new_tab ? 1 : 0;
761             if (event->state == SHIFT_MASK) focus = !focus;
762             a_UIcmd_open_url_nt(bw, url, focus);
763          } else
764             a_UIcmd_open_url_nw(bw, url);
765       } else {
766          return false;
767       }
768 
769       /* Change the link color to "visited" as visual feedback */
770       for (Widget *w = widget; w; w = w->getParent()) {
771          _MSG("  ->%s\n", w->getClassName());
772          if (w->instanceOf(dw::Textblock::CLASS_ID)) {
773             ((Textblock*)w)->changeLinkColor (link, html->visited_color);
774             break;
775          }
776       }
777    }
778    return true;
779 }
780 
781 /*
782  * Initialize the stash buffer
783  */
a_Html_stash_init(DilloHtml * html)784 void a_Html_stash_init(DilloHtml *html)
785 {
786    S_TOP(html)->parse_mode = DILLO_HTML_PARSE_MODE_STASH;
787    html->StashSpace = false;
788    dStr_truncate(html->Stash, 0);
789 }
790 
/* Entities list from the HTML 4.01 DTD */
/* One record of the entity table. */
typedef struct {
   const char *entity;   /* entity name, without the '&' and the ';' */
   int isocode;          /* character code the entity stands for */
} Ent_t;
796 
/*
 * Entity table.
 * The entries are kept sorted by name in strcmp() order (upper case
 * before lower case) because Html_entity_search() looks names up with
 * bsearch(). Codes written with a leading 0 are octal constants; amp,
 * gt, lt and quot are written in decimal.
 */
#define NumEnt 252
static const Ent_t Entities[NumEnt] = {
   {"AElig",0306}, {"Aacute",0301}, {"Acirc",0302},  {"Agrave",0300},
   {"Alpha",01621},{"Aring",0305},  {"Atilde",0303}, {"Auml",0304},
   {"Beta",01622}, {"Ccedil",0307}, {"Chi",01647},   {"Dagger",020041},
   {"Delta",01624},{"ETH",0320},    {"Eacute",0311}, {"Ecirc",0312},
   {"Egrave",0310},{"Epsilon",01625},{"Eta",01627},  {"Euml",0313},
   {"Gamma",01623},{"Iacute",0315}, {"Icirc",0316},  {"Igrave",0314},
   {"Iota",01631}, {"Iuml",0317},   {"Kappa",01632}, {"Lambda",01633},
   {"Mu",01634},   {"Ntilde",0321}, {"Nu",01635},    {"OElig",0522},
   {"Oacute",0323},{"Ocirc",0324},  {"Ograve",0322}, {"Omega",01651},
   {"Omicron",01637},{"Oslash",0330},{"Otilde",0325},{"Ouml",0326},
   {"Phi",01646},  {"Pi",01640},    {"Prime",020063},{"Psi",01650},
   {"Rho",01641},  {"Scaron",0540}, {"Sigma",01643}, {"THORN",0336},
   {"Tau",01644},  {"Theta",01630}, {"Uacute",0332}, {"Ucirc",0333},
   {"Ugrave",0331},{"Upsilon",01645},{"Uuml",0334},  {"Xi",01636},
   {"Yacute",0335},{"Yuml",0570},   {"Zeta",01626},  {"aacute",0341},
   {"acirc",0342}, {"acute",0264},  {"aelig",0346},  {"agrave",0340},
   {"alefsym",020465},{"alpha",01661},{"amp",38},    {"and",021047},
   {"ang",021040}, {"aring",0345},  {"asymp",021110},{"atilde",0343},
   {"auml",0344},  {"bdquo",020036},{"beta",01662},  {"brvbar",0246},
   {"bull",020042},{"cap",021051},  {"ccedil",0347}, {"cedil",0270},
   {"cent",0242},  {"chi",01707},   {"circ",01306},  {"clubs",023143},
   {"cong",021105},{"copy",0251},   {"crarr",020665},{"cup",021052},
   {"curren",0244},{"dArr",020723}, {"dagger",020040},{"darr",020623},
   {"deg",0260},   {"delta",01664}, {"diams",023146},{"divide",0367},
   {"eacute",0351},{"ecirc",0352},  {"egrave",0350}, {"empty",021005},
   {"emsp",020003},{"ensp",020002}, {"epsilon",01665},{"equiv",021141},
   {"eta",01667},  {"eth",0360},    {"euml",0353},   {"euro",020254},
   {"exist",021003},{"fnof",0622},  {"forall",021000},{"frac12",0275},
   {"frac14",0274},{"frac34",0276}, {"frasl",020104},{"gamma",01663},
   {"ge",021145},  {"gt",62},       {"hArr",020724}, {"harr",020624},
   {"hearts",023145},{"hellip",020046},{"iacute",0355},{"icirc",0356},
   {"iexcl",0241}, {"igrave",0354}, {"image",020421},{"infin",021036},
   {"int",021053}, {"iota",01671},  {"iquest",0277}, {"isin",021010},
   {"iuml",0357},  {"kappa",01672}, {"lArr",020720}, {"lambda",01673},
   {"lang",021451},{"laquo",0253},  {"larr",020620}, {"lceil",021410},
   {"ldquo",020034},{"le",021144},  {"lfloor",021412},{"lowast",021027},
   {"loz",022712}, {"lrm",020016},  {"lsaquo",020071},{"lsquo",020030},
   {"lt",60},      {"macr",0257},   {"mdash",020024},{"micro",0265},
   {"middot",0267},{"minus",021022},{"mu",01674},    {"nabla",021007},
   {"nbsp",0240},  {"ndash",020023},{"ne",021140},   {"ni",021013},
   {"not",0254},   {"notin",021011},{"nsub",021204}, {"ntilde",0361},
   {"nu",01675},   {"oacute",0363}, {"ocirc",0364},  {"oelig",0523},
   {"ograve",0362},{"oline",020076},{"omega",01711}, {"omicron",01677},
   {"oplus",021225},{"or",021050},  {"ordf",0252},   {"ordm",0272},
   {"oslash",0370},{"otilde",0365}, {"otimes",021227},{"ouml",0366},
   {"para",0266},  {"part",021002}, {"permil",020060},{"perp",021245},
   {"phi",01706},  {"pi",01700},    {"piv",01726},   {"plusmn",0261},
   {"pound",0243}, {"prime",020062},{"prod",021017}, {"prop",021035},
   {"psi",01710},  {"quot",34},     {"rArr",020722}, {"radic",021032},
   {"rang",021452},{"raquo",0273},  {"rarr",020622}, {"rceil",021411},
   {"rdquo",020035},{"real",020434},{"reg",0256},    {"rfloor",021413},
   {"rho",01701},  {"rlm",020017},  {"rsaquo",020072},{"rsquo",020031},
   {"sbquo",020032},{"scaron",0541},{"sdot",021305}, {"sect",0247},
   {"shy",0255},   {"sigma",01703}, {"sigmaf",01702},{"sim",021074},
   {"spades",023140},{"sub",021202},{"sube",021206}, {"sum",021021},
   {"sup",021203}, {"sup1",0271},   {"sup2",0262},   {"sup3",0263},
   {"supe",021207},{"szlig",0337},  {"tau",01704},   {"there4",021064},
   {"theta",01670},{"thetasym",01721},{"thinsp",020011},{"thorn",0376},
   {"tilde",01334},{"times",0327},  {"trade",020442},{"uArr",020721},
   {"uacute",0372},{"uarr",020621}, {"ucirc",0373},  {"ugrave",0371},
   {"uml",0250},   {"upsih",01722}, {"upsilon",01705},{"uuml",0374},
   {"weierp",020430},{"xi",01676},  {"yacute",0375}, {"yen",0245},
   {"yuml",0377},  {"zeta",01666},  {"zwj",020015},  {"zwnj",020014}
};
863 
864 
865 /*
866  * Comparison function for binary search
867  */
Html_entity_comp(const void * a,const void * b)868 static int Html_entity_comp(const void *a, const void *b)
869 {
870    return strcmp(((Ent_t *)a)->entity, ((Ent_t *)b)->entity);
871 }
872 
873 /*
874  * Binary search of 'key' in entity list
875  */
Html_entity_search(char * key)876 static int Html_entity_search(char *key)
877 {
878    Ent_t *res, EntKey;
879 
880    EntKey.entity = key;
881    res = (Ent_t*) bsearch(&EntKey, Entities, NumEnt,
882                           sizeof(Ent_t), Html_entity_comp);
883    if (res)
884      return (res - Entities);
885    return -1;
886 }
887 
/*
 * Map the M$ non-standard "smart quotes" codes (w1252, now deprecated even
 * by them) to plain replacements:
 *   145, 146 -> '\''      147, 148 -> '"'
 *   149      -> 176       150, 151 -> '-'
 * Any other code is returned unchanged.
 *
 * SGML for HTML4.01 defines c >= 128 and c <= 159 as UNUSED.
 * TODO: Probably I should remove this hack, and add a HTML warning. --Jcid
 */
static int Html_ms_stupid_quotes_2ucs(int isocode)
{
   if (isocode == 145 || isocode == 146)
      return '\'';
   if (isocode == 147 || isocode == 148)
      return '"';
   if (isocode == 149)
      return 176;
   if (isocode == 150 || isocode == 151)
      return '-';
   return isocode;
}
909 
910 /*
911  * Given an entity, return the UCS character code.
912  * Returns a negative value (error code) if not a valid entity.
913  *
914  * The first character *token is assumed to be == '&'
915  *
916  * For valid entities, *entsize is set to the length of the parsed entity.
917  */
Html_parse_entity(DilloHtml * html,const char * token,int toksize,int * entsize)918 static int Html_parse_entity(DilloHtml *html, const char *token,
919                              int toksize, int *entsize)
920 {
921    int isocode, i;
922    char *tok, *s, c;
923 
924    token++;
925    tok = s = toksize ? dStrndup(token, (uint_t)toksize) : dStrdup(token);
926 
927    isocode = -1;
928 
929    if (*s == '#') {
930       /* numeric character reference */
931       errno = 0;
932       if (*++s == 'x' || *s == 'X') {
933          if (isxdigit(*++s)) {
934             /* strtol with base 16 accepts leading "0x" - we don't */
935             if (*s == '0' && s[1] == 'x') {
936                s++;
937                isocode = 0;
938             } else {
939                isocode = strtol(s, &s, 16);
940             }
941          }
942       } else if (isdigit(*s)) {
943          isocode = strtol(s, &s, 10);
944       }
945 
946       if (!isocode || errno || isocode > 0xffff) {
947          /* this catches null bytes, errors and codes >= 0xFFFF */
948          BUG_MSG("Numeric character reference \"%s\" out of range.", tok);
949          isocode = -2;
950       }
951 
952       if (isocode != -1) {
953          if (*s == ';')
954             s++;
955          else if (prefs.show_extra_warnings)
956             BUG_MSG("Numeric character reference without trailing ';'.");
957       }
958 
959    } else if (isalpha(*s)) {
960       /* character entity reference */
961       while (*++s && (isalnum(*s) || strchr(":_.-", *s))) ;
962       c = *s;
963       *s = 0;
964 
965       if ((i = Html_entity_search(tok)) >= 0) {
966          isocode = Entities[i].isocode;
967       } else {
968          if (html->DocType == DT_XHTML && !strcmp(tok, "apos")) {
969             isocode = 0x27;
970          } else {
971             if ((html->DocType == DT_HTML && html->DocTypeVersion == 4.01f) ||
972                 html->DocType == DT_XHTML)
973                BUG_MSG("Undefined character entity '%s'.", tok);
974             isocode = -3;
975          }
976       }
977       if (c == ';')
978          s++;
979       else if (prefs.show_extra_warnings)
980          BUG_MSG("Character entity reference without trailing ';'.");
981    }
982 
983    *entsize = s-tok+1;
984    dFree(tok);
985 
986    if (isocode >= 145 && isocode <= 151) {
987       /* TODO: remove this hack. */
988       isocode = Html_ms_stupid_quotes_2ucs(isocode);
989    } else if (isocode == -1 && prefs.show_extra_warnings)
990       BUG_MSG("Literal '&'.");
991 
992    return isocode;
993 }
994 
995 /*
996  * Convert all the entities in a token to utf8 encoding. Takes
997  * a token and its length, and returns a newly allocated string.
998  */
a_Html_parse_entities(DilloHtml * html,const char * token,int toksize)999 char *a_Html_parse_entities(DilloHtml *html, const char *token, int toksize)
1000 {
1001    const char *esc_set = "&";
1002    char *new_str, buf[4];
1003    int i, j, k, n, s, isocode, entsize;
1004 
1005    new_str = dStrndup(token, toksize);
1006    s = strcspn(new_str, esc_set);
1007    if (new_str[s] == 0)
1008       return new_str;
1009 
1010    for (i = j = s; i < toksize; i++) {
1011       if (token[i] == '&' &&
1012           (isocode = Html_parse_entity(html, token+i,
1013                                        toksize-i, &entsize)) >= 0) {
1014          if (isocode >= 128) {
1015             /* multibyte encoding */
1016             n = a_Utf8_encode(isocode, buf);
1017             for (k = 0; k < n; ++k)
1018                new_str[j++] = buf[k];
1019          } else {
1020             new_str[j++] = (char) isocode;
1021          }
1022          i += entsize-1;
1023       } else {
1024          new_str[j++] = token[i];
1025       }
1026    }
1027    new_str[j] = '\0';
1028    return new_str;
1029 }
1030 
1031 /*
1032  * For white-space: pre-line, we must break the line if encountering a newline.
1033  * Otherwise, collapse whitespace as usual.
1034  */
Html_process_space_pre_line(DilloHtml * html,const char * space,int spacesize)1035 static void Html_process_space_pre_line(DilloHtml *html, const char *space,
1036                                         int spacesize)
1037 {
1038    int i, breakCnt = 0;
1039 
1040    for (i = 0; i < spacesize; i++) {
1041       /* Support for "\r", "\n" and "\r\n" line breaks */
1042       if (space[i] == '\r' || (space[i] == '\n' && !html->PrevWasCR)) {
1043          breakCnt++;
1044          html->PrevWasCR = (space[i] == '\r');
1045 
1046          HT2TB(html)->addLinebreak (html->wordStyle ());
1047       }
1048    }
1049    if (breakCnt == 0) {
1050       HT2TB(html)->addSpace(html->wordStyle ());
1051    }
1052 }
1053 
1054 /*
1055  * Parse spaces
1056  */
Html_process_space(DilloHtml * html,const char * space,int spacesize)1057 static void Html_process_space(DilloHtml *html, const char *space,
1058                                int spacesize)
1059 {
1060    char *spc;
1061    int i, offset;
1062    DilloHtmlParseMode parse_mode = S_TOP(html)->parse_mode;
1063 
1064    if (S_TOP(html)->display_none) {
1065       /* do nothing */
1066    } else if (parse_mode == DILLO_HTML_PARSE_MODE_STASH) {
1067       html->StashSpace = (html->Stash->len > 0);
1068 
1069    } else if (parse_mode == DILLO_HTML_PARSE_MODE_VERBATIM) {
1070       dStr_append_l(html->Stash, space, spacesize);
1071 
1072    } else if (parse_mode == DILLO_HTML_PARSE_MODE_PRE) {
1073       int spaceCnt = 0;
1074 
1075       /* re-scan the string for characters that cause line breaks */
1076       for (i = 0; i < spacesize; i++) {
1077          /* Support for "\r", "\n" and "\r\n" line breaks (skips the first) */
1078          if (!html->PreFirstChar &&
1079              (space[i] == '\r' || (space[i] == '\n' && !html->PrevWasCR))) {
1080 
1081             if (spaceCnt) {
1082                spc = dStrnfill(spaceCnt, ' ');
1083                HT2TB(html)->addText (spc, spaceCnt, html->wordStyle ());
1084                dFree(spc);
1085                spaceCnt = 0;
1086             }
1087             HT2TB(html)->addLinebreak (html->wordStyle ());
1088             html->pre_column = 0;
1089          }
1090          html->PreFirstChar = false;
1091 
1092          /* cr and lf should not be rendered -- they appear as a break */
1093          switch (space[i]) {
1094          case '\r':
1095          case '\n':
1096             break;
1097          case '\t':
1098             if (prefs.show_extra_warnings)
1099                BUG_MSG("TAB character inside <pre>.");
1100             offset = TAB_SIZE - html->pre_column % TAB_SIZE;
1101             spaceCnt += offset;
1102             html->pre_column += offset;
1103             break;
1104          default:
1105             spaceCnt++;
1106             html->pre_column++;
1107             break;
1108          }
1109 
1110          html->PrevWasCR = (space[i] == '\r');
1111       }
1112 
1113       if (spaceCnt) {
1114          // add break possibility for the white-space:pre-wrap case
1115          HT2TB(html)->addBreakOption (html->wordStyle (), false);
1116          spc = dStrnfill(spaceCnt, ' ');
1117          HT2TB(html)->addText (spc, spaceCnt, html->wordStyle ());
1118          dFree(spc);
1119       }
1120 
1121    } else {
1122       if (SGML_SPCDEL) {
1123          /* SGML_SPCDEL ignores white space immediately after an open tag */
1124       } else if (html->wordStyle ()->whiteSpace == WHITE_SPACE_PRE_LINE) {
1125          Html_process_space_pre_line(html, space, spacesize);
1126       } else {
1127          HT2TB(html)->addSpace(html->wordStyle ());
1128       }
1129 
1130       if (parse_mode == DILLO_HTML_PARSE_MODE_STASH_AND_BODY)
1131          html->StashSpace = (html->Stash->len > 0);
1132    }
1133 }
1134 
1135 /*
1136  * Handles putting the word into its proper place
1137  *  > STASH and VERBATIM --> html->Stash
1138  *  > otherwise it goes through addText()
1139  *
1140  * Entities are parsed (or not) according to parse_mode.
1141  * 'word' is a '\0'-terminated string.
1142  */
Html_process_word(DilloHtml * html,const char * word,int size)1143 static void Html_process_word(DilloHtml *html, const char *word, int size)
1144 {
1145    int i, j, start;
1146    char *Pword;
1147    DilloHtmlParseMode parse_mode = S_TOP(html)->parse_mode;
1148 
1149    if (S_TOP(html)->display_none)
1150       return;
1151 
1152    if (parse_mode == DILLO_HTML_PARSE_MODE_STASH ||
1153        parse_mode == DILLO_HTML_PARSE_MODE_STASH_AND_BODY) {
1154       if (html->StashSpace) {
1155          dStr_append_c(html->Stash, ' ');
1156          html->StashSpace = false;
1157       }
1158       Pword = a_Html_parse_entities(html, word, size);
1159       dStr_append(html->Stash, Pword);
1160       dFree(Pword);
1161 
1162    } else if (parse_mode == DILLO_HTML_PARSE_MODE_VERBATIM) {
1163       /* word goes in untouched, it is not processed here. */
1164       dStr_append_l(html->Stash, word, size);
1165    }
1166 
1167    if (parse_mode == DILLO_HTML_PARSE_MODE_STASH ||
1168        parse_mode == DILLO_HTML_PARSE_MODE_VERBATIM) {
1169       /* skip until the closing instructions */
1170 
1171    } else if (parse_mode == DILLO_HTML_PARSE_MODE_PRE) {
1172       /* all this overhead is to catch white-space entities */
1173       Pword = a_Html_parse_entities(html, word, size);
1174       for (start = i = 0; Pword[i]; start = i)
1175          if (isspace(Pword[i])) {
1176             while (Pword[++i] && isspace(Pword[i])) ;
1177             Html_process_space(html, Pword + start, i - start);
1178          } else {
1179             while (Pword[++i] && !isspace(Pword[i])) ;
1180             HT2TB(html)->addText(Pword + start, i - start, html->wordStyle ());
1181             html->pre_column += i - start;
1182             html->PreFirstChar = false;
1183          }
1184       dFree(Pword);
1185 
1186    } else {
1187       const char *word2, *beyond_word2;
1188 
1189       Pword = NULL;
1190       if (!memchr(word,'&', size)) {
1191          /* No entities */
1192          word2 = word;
1193          beyond_word2 = word + size;
1194       } else {
1195          /* Collapse white-space entities inside the word (except &nbsp;) */
1196          Pword = a_Html_parse_entities(html, word, size);
1197          /* Collapse adjacent " \t\f\n\r" characters into a single space */
1198          for (i = j = 0; (Pword[i] = Pword[j]); ++i, ++j) {
1199             if (strchr(" \t\f\n\r", Pword[i])) {
1200                if (i == 0 || (i > 0 && Pword[i-1] != ' '))
1201                   Pword[i] = ' ';
1202                else
1203                   for (--i; Pword[j+1] && strchr(" \t\f\n\r", Pword[j+1]); ++j)
1204                      ;
1205             }
1206          }
1207          word2 = Pword;
1208          beyond_word2 = word2 + strlen(word2);
1209       }
1210       for (start = i = 0; word2[i]; start = i) {
1211          int len;
1212 
1213          if (isspace(word2[i])) {
1214             while (word2[++i] && isspace(word2[i])) ;
1215             Html_process_space(html, word2 + start, i - start);
1216          } else if (!strncmp(word2+i, utf8_zero_width_space, 3)) {
1217             i += 3;
1218             HT2TB(html)->addBreakOption(html->wordStyle (), false);
1219          } else if (a_Utf8_ideographic(word2+i, beyond_word2, &len)) {
1220             i += len;
1221             HT2TB(html)->addText(word2 + start, i - start, html->wordStyle ());
1222             HT2TB(html)->addBreakOption(html->wordStyle (), false);
1223          } else {
1224             do {
1225                i += len;
1226             } while (word2[i] && !isspace(word2[i]) &&
1227                      strncmp(word2+i, utf8_zero_width_space, 3) &&
1228                      (!a_Utf8_ideographic(word2+i, beyond_word2, &len)));
1229             HT2TB(html)->addText(word2 + start, i - start, html->wordStyle ());
1230          }
1231       }
1232       if (Pword == word2)
1233          dFree(Pword);
1234    }
1235 }
1236 
1237 /*
1238  * Does the tag in tagstr (e.g. "p") match the tag in the tag, tagsize
1239  * structure, with the initial < skipped over (e.g. "P align=center>")?
1240  */
Html_match_tag(const char * tagstr,char * tag,int tagsize)1241 static bool Html_match_tag(const char *tagstr, char *tag, int tagsize)
1242 {
1243    int i;
1244 
1245    for (i = 0; i < tagsize && tagstr[i] != '\0'; i++) {
1246       if (D_ASCII_TOLOWER(tagstr[i]) != D_ASCII_TOLOWER(tag[i]))
1247          return false;
1248    }
1249    /* The test for '/' is for xml compatibility: "empty/>" will be matched. */
1250    if (i < tagsize && (isspace(tag[i]) || tag[i] == '>' || tag[i] == '/'))
1251       return true;
1252    return false;
1253 }
1254 
1255 /*
1256  * This function is called after popping the stack, to
1257  * handle nested Textblock widgets.
1258  */
Html_eventually_pop_dw(DilloHtml * html,bool hand_over_break)1259 static void Html_eventually_pop_dw(DilloHtml *html, bool hand_over_break)
1260 {
1261    if (html->dw != S_TOP(html)->textblock) {
1262       if (hand_over_break)
1263          HT2TB(html)->handOverBreak (html->style ());
1264       HT2TB(html)->flush ();
1265       html->dw = S_TOP(html)->textblock;
1266    }
1267 }
1268 
1269 /*
1270  * Push the tag (copying attributes from the top of the stack)
1271  */
Html_push_tag(DilloHtml * html,int tag_idx)1272 static void Html_push_tag(DilloHtml *html, int tag_idx)
1273 {
1274    int n_items;
1275 
1276    n_items = html->stack->size ();
1277    html->stack->increase ();
1278    /* We'll copy the former stack item and just change the tag and its index
1279     * instead of copying all fields except for tag.  --Jcid */
1280    *html->stack->getRef(n_items) = *html->stack->getRef(n_items - 1);
1281    html->stack->getRef(n_items)->tag_idx = tag_idx;
1282    html->dw = S_TOP(html)->textblock;
1283 }
1284 
1285 /*
1286  * Push the tag (used to force en element with optional open into the stack)
1287  * Note: now it's the same as Html_push_tag(), but things may change...
1288  */
Html_force_push_tag(DilloHtml * html,int tag_idx)1289 static void Html_force_push_tag(DilloHtml *html, int tag_idx)
1290 {
1291    html->startElement (tag_idx);
1292    Html_push_tag(html, tag_idx);
1293 }
1294 
1295 /*
1296  * Pop the top tag in the stack
1297  */
Html_real_pop_tag(DilloHtml * html)1298 static void Html_real_pop_tag(DilloHtml *html)
1299 {
1300    bool hand_over_break;
1301 
1302    html->styleEngine->endElement (S_TOP(html)->tag_idx);
1303    hand_over_break = S_TOP(html)->hand_over_break;
1304    html->stack->setSize (html->stack->size() - 1);
1305    Html_eventually_pop_dw(html, hand_over_break);
1306 }
1307 
1308 /*
1309  * Cleanup the stack to a given index.
1310  */
Html_tag_cleanup_to_idx(DilloHtml * html,int idx)1311 static void Html_tag_cleanup_to_idx(DilloHtml *html, int idx)
1312 {
1313    int s_sz;
1314    while ((s_sz = html->stack->size()) > idx) {
1315       int toptag_idx = S_TOP(html)->tag_idx;
1316       TagInfo toptag = Tags[toptag_idx];
1317       if (s_sz > idx + 1 && toptag.EndTag != 'O')
1318          BUG_MSG("  - forcing close of open tag: <%s>.", toptag.name);
1319       _MSG("Close: %*s%s\n", size," ", toptag.name);
1320       if (toptag.close)
1321          toptag.close(html);
1322       Html_real_pop_tag(html);
1323    }
1324 }
1325 
1326 /*
1327  * Default close function for tags.
1328  * (conditional cleanup of the stack)
1329  * There are several ways of doing it. Considering the HTML 4.01 spec
1330  * which defines optional close tags, and the will to deliver useful diagnose
1331  * messages for bad-formed HTML, it'll go as follows:
1332  *   1.- Search the stack for the first tag that requires a close tag.
1333  *   2.- If it matches, clean all the optional-close tags in between.
1334  *   3.- Cleanup the matching tag. (on error, give a warning message)
1335  *
1336  * If 'w3c_mode' is NOT enabled:
1337  *   1.- Search the stack for a matching tag based on tag level.
1338  *   2.- If it exists, clean all the tags in between.
1339  *   3.- Cleanup the matching tag. (on error, give a warning message)
1340  */
Html_tag_cleanup_at_close(DilloHtml * html,int new_idx)1341 static void Html_tag_cleanup_at_close(DilloHtml *html, int new_idx)
1342 {
1343    static int i_BUTTON = a_Html_tag_index("button"),
1344               i_SELECT = a_Html_tag_index("select"),
1345               i_TEXTAREA = a_Html_tag_index("textarea");
1346    int w3c_mode = !prefs.w3c_plus_heuristics;
1347    int stack_idx, tag_idx, matched = 0, expected = 0;
1348    TagInfo new_tag = Tags[new_idx];
1349 
1350    /* Look for the candidate tag to close */
1351    stack_idx = html->stack->size();
1352    while (--stack_idx) {
1353       tag_idx = html->stack->getRef(stack_idx)->tag_idx;
1354       if (tag_idx == new_idx) {
1355          /* matching tag found */
1356          matched = 1;
1357          break;
1358       } else if (Tags[tag_idx].EndTag == 'O') {
1359          /* skip an optional tag */
1360          continue;
1361       } else if ((new_idx == i_BUTTON && html->InFlags & IN_BUTTON) ||
1362                  (new_idx == i_SELECT && html->InFlags & IN_SELECT) ||
1363                  (new_idx == i_TEXTAREA && html->InFlags & IN_TEXTAREA)) {
1364          /* let these elements close tags inside them */
1365          continue;
1366       } else if (w3c_mode || Tags[tag_idx].TagLevel >= new_tag.TagLevel) {
1367          /* this is the tag that should have been closed */
1368          expected = 1;
1369          break;
1370       }
1371    }
1372 
1373    if (matched) {
1374       Html_tag_cleanup_to_idx(html, stack_idx);
1375    } else if (expected) {
1376       BUG_MSG("Unexpected closing tag: </%s> -- expected </%s>.",
1377               new_tag.name, Tags[tag_idx].name);
1378    } else {
1379       BUG_MSG("Unexpected closing tag: </%s>.", new_tag.name);
1380    }
1381 }
1382 
1383 /*
1384  * Avoid nesting and inter-nesting of BUTTON, SELECT and TEXTAREA,
1385  * by closing them before opening another.
1386  * This is not an HTML SPEC restriction , but it avoids lots of trouble
1387  * inside dillo (concurrent inputs), and makes almost no sense to have.
1388  */
Html_tag_cleanup_nested_inputs(DilloHtml * html,int new_idx)1389 static void Html_tag_cleanup_nested_inputs(DilloHtml *html, int new_idx)
1390 {
1391    static int i_BUTTON = a_Html_tag_index("button"),
1392               i_SELECT = a_Html_tag_index("select"),
1393               i_TEXTAREA = a_Html_tag_index("textarea");
1394    int stack_idx, u_idx, matched = 0;
1395 
1396    dReturn_if_fail(html->InFlags & (IN_BUTTON | IN_SELECT | IN_TEXTAREA));
1397    dReturn_if_fail(new_idx == i_BUTTON || new_idx == i_SELECT ||
1398                    new_idx == i_TEXTAREA);
1399 
1400    /* Get the unclosed tag index */
1401    u_idx = (html->InFlags & IN_BUTTON) ? i_BUTTON :
1402                  (html->InFlags & IN_SELECT) ? i_SELECT : i_TEXTAREA;
1403 
1404    /* Look for it inside the stack */
1405    stack_idx = html->stack->size();
1406    while (--stack_idx) {
1407       if (html->stack->getRef(stack_idx)->tag_idx == u_idx) {
1408          /* matching tag found */
1409          matched = 1;
1410          break;
1411       }
1412    }
1413 
1414    if (matched) {
1415       BUG_MSG("Attempt to nest <%s> element inside <%s> -- closing <%s>.",
1416               Tags[new_idx].name, Tags[u_idx].name, Tags[u_idx].name);
1417       Html_tag_cleanup_to_idx(html, stack_idx);
1418    } else {
1419       MSG_WARN("Inconsistent parser state, flag is SET but no '%s' element"
1420                "was found in the stack\n", Tags[u_idx].name);
1421    }
1422 
1423    html->InFlags &= ~(IN_BUTTON | IN_SELECT | IN_TEXTAREA);
1424 }
1425 
1426 
1427 /*
1428  * Some parsing routines.
1429  */
1430 
1431 /*
1432  * Used by a_Html_parse_length
1433  */
Html_parse_length_or_multi_length(const char * attr,char ** endptr)1434 static CssLength Html_parse_length_or_multi_length (const char *attr,
1435                                                     char **endptr)
1436 {
1437    CssLength l;
1438    double v;
1439    char *end;
1440 
1441    v = strtod (attr, &end);
1442    switch (*end) {
1443    case '%':
1444       end++;
1445       l = CSS_CREATE_LENGTH (v / 100, CSS_LENGTH_TYPE_PERCENTAGE);
1446       break;
1447 
1448    case '*':
1449       end++;
1450       l = CSS_CREATE_LENGTH (v, CSS_LENGTH_TYPE_RELATIVE);
1451       break;
1452 /*
1453    The "px" suffix seems not allowed by HTML4.01 SPEC.
1454    case 'p':
1455       if (end[1] == 'x')
1456          end += 2;
1457 */
1458    default:
1459       l = CSS_CREATE_LENGTH (v, CSS_LENGTH_TYPE_PX);
1460       break;
1461    }
1462 
1463    if (endptr)
1464       *endptr = end;
1465    return l;
1466 }
1467 
1468 
1469 /*
1470  * Returns a length or a percentage, or UNDEF_LENGTH in case
1471  * of an error, or if attr is NULL.
1472  */
a_Html_parse_length(DilloHtml * html,const char * attr)1473 CssLength a_Html_parse_length (DilloHtml *html, const char *attr)
1474 {
1475    CssLength l;
1476    char *end;
1477 
1478    l = Html_parse_length_or_multi_length (attr, &end);
1479    if (CSS_LENGTH_TYPE (l) == CSS_LENGTH_TYPE_RELATIVE)
1480       /* not allowed as &Length; */
1481       l = CSS_CREATE_LENGTH(0.0, CSS_LENGTH_TYPE_AUTO);
1482    else {
1483       /* allow only whitespaces */
1484       if (*end && !isspace (*end)) {
1485          BUG_MSG("Garbage after length: '%s'.", attr);
1486          l = CSS_CREATE_LENGTH(0.0, CSS_LENGTH_TYPE_AUTO);
1487       }
1488    }
1489 
1490    _MSG("a_Html_parse_length: \"%s\" %d\n", attr, CSS_LENGTH_VALUE(l));
1491    return l;
1492 }
1493 
1494 /*
1495  * Parse a color attribute.
1496  * Return value: parsed color, or default_color (+ error msg) on error.
1497  */
a_Html_color_parse(DilloHtml * html,const char * str,int32_t default_color)1498 int32_t a_Html_color_parse(DilloHtml *html, const char *str,
1499                            int32_t default_color)
1500 {
1501    int err = 1;
1502    int32_t color = a_Color_parse(str, default_color, &err);
1503 
1504    if (err) {
1505       BUG_MSG("Color '%s' is not in \"#RRGGBB\" format.", str);
1506    }
1507    return color;
1508 }
1509 
1510 /*
1511  * Check that 'val' is composed of characters inside [A-Za-z0-9:_.-]
1512  * Note: ID can't have entities, but this check is enough (no '&').
1513  * Return value: 1 if OK, 0 otherwise.
1514  */
1515 static int
Html_check_name_val(DilloHtml * html,const char * val,const char * attrname)1516  Html_check_name_val(DilloHtml *html, const char *val, const char *attrname)
1517 {
1518    if (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f) {
1519       bool valid = *val && !strchr(val, ' ');
1520 
1521       if (!valid) {
1522          BUG_MSG("'%s' value \"%s\" must not be empty and must not contain "
1523                  "spaces.", attrname, val);
1524       }
1525       return valid ? 1 : 0;
1526    } else {
1527       int i;
1528 
1529       for (i = 0; val[i]; ++i)
1530          if (!isascii(val[i]) || !(isalnum(val[i]) || strchr(":_.-", val[i])))
1531             break;
1532 
1533       if (val[i] || !(isascii(val[0]) && isalpha(val[0])))
1534          BUG_MSG("%s attribute value \"%s\" is not of the form "
1535                  "'[A-Za-z][A-Za-z0-9:_.-]*'.", attrname, val);
1536 
1537       return !(val[i]);
1538    }
1539 }
1540 
1541 /*
1542  * Handle DOCTYPE declaration
1543  *
1544  * Follows the convention that HTML 4.01
1545  * doctypes which include a full w3c DTD url are treated as
1546  * standards-compliant, but 4.01 without the url and HTML 4.0 and
1547  * earlier are not. XHTML doctypes are always standards-compliant
1548  * whether or not an url is present.
1549  *
1550  * Note: I'm not sure about this convention. The W3C validator
1551  * recognizes the "HTML Level" with or without the URL. The convention
1552  * comes from mozilla (see URLs below), but Dillo doesn't have the same
1553  * rendering modes, so it may be better to chose another behaviour. --Jcid
1554  *
1555  * http://www.mozilla.org/docs/web-developer/quirks/doctypes.html
1556  * http://lists.auriga.wearlab.de/pipermail/dillo-dev/2004-October/002300.html
1557  *
1558  * This is not a full DOCTYPE parser, just enough for what Dillo uses.
1559  */
Html_parse_doctype(DilloHtml * html,const char * tag,int tagsize)1560 static void Html_parse_doctype(DilloHtml *html, const char *tag, int tagsize)
1561 {
1562    static const char HTML_SGML_sig [] = "<!DOCTYPE HTML PUBLIC ";
1563    static const char HTML20     [] = "-//IETF//DTD HTML";
1564    static const char HTML32     [] = "-//W3C//DTD HTML 3.2";
1565    static const char HTML40     [] = "-//W3C//DTD HTML 4.0";
1566    static const char HTML401    [] = "-//W3C//DTD HTML 4.01";
1567    static const char HTML401_url[] = "http://www.w3.org/TR/html4/";
1568    static const char XHTML1     [] = "-//W3C//DTD XHTML 1.0";
1569    static const char XHTML1_url [] = "http://www.w3.org/TR/xhtml1/DTD/";
1570    static const char XHTML11    [] = "-//W3C//DTD XHTML 1.1";
1571    static const char XHTML11_url[] = "http://www.w3.org/TR/xhtml11/DTD/";
1572 
1573    size_t i;
1574    int quote;
1575    char *p, *ntag = dStrndup(tag, tagsize);
1576 
1577    /* Tag sanitization: Collapse whitespace between tokens
1578     * and replace '\n' and '\r' with ' ' inside quoted strings. */
1579    for (i = 0, p = ntag; *p; ++p) {
1580       if (isspace(*p)) {
1581          for (ntag[i++] = ' '; isspace(p[1]); ++p) ;
1582       } else if ((quote = *p) == '"' || *p == '\'') {
1583          for (ntag[i++] = *p++; (ntag[i] = *p) && ntag[i++] != quote; ++p) {
1584             if (*p == '\n' || *p == '\r')
1585                ntag[i - 1] = ' ';
1586             p += (p[0] == '\r' && p[1] == '\n') ? 1 : 0;
1587          }
1588       } else {
1589          ntag[i++] = *p;
1590       }
1591       if (!*p)
1592          break;
1593    }
1594    ntag[i] = 0;
1595 
1596    _MSG("New: {%s}\n", ntag);
1597 
1598    if (html->DocType != DT_NONE)
1599       BUG_MSG("Multiple DOCTYPE declarations.");
1600 
1601    /* The default DT_NONE type is TagSoup */
1602    if (i > strlen(HTML_SGML_sig) && // avoid out of bounds reads!
1603        !dStrnAsciiCasecmp(ntag, HTML_SGML_sig, strlen(HTML_SGML_sig))) {
1604       p = ntag + strlen(HTML_SGML_sig) + 1;
1605       if (!strncmp(p, HTML401, strlen(HTML401)) &&
1606           dStriAsciiStr(p + strlen(HTML401), HTML401_url)) {
1607          html->DocType = DT_HTML;
1608          html->DocTypeVersion = 4.01f;
1609       } else if (!strncmp(p, XHTML1, strlen(XHTML1)) &&
1610                  dStriAsciiStr(p + strlen(XHTML1), XHTML1_url)) {
1611          html->DocType = DT_XHTML;
1612          html->DocTypeVersion = 1.0f;
1613       } else if (!strncmp(p, XHTML11, strlen(XHTML11)) &&
1614                  dStriAsciiStr(p + strlen(XHTML11), XHTML11_url)) {
1615          html->DocType = DT_XHTML;
1616          html->DocTypeVersion = 1.1f;
1617       } else if (!strncmp(p, HTML40, strlen(HTML40))) {
1618          html->DocType = DT_HTML;
1619          html->DocTypeVersion = 4.0f;
1620       } else if (!strncmp(p, HTML32, strlen(HTML32))) {
1621          html->DocType = DT_HTML;
1622          html->DocTypeVersion = 3.2f;
1623       } else if (!strncmp(p, HTML20, strlen(HTML20))) {
1624          html->DocType = DT_HTML;
1625          html->DocTypeVersion = 2.0f;
1626       }
1627    } else if (!dStrAsciiCasecmp(ntag, "<!DOCTYPE html>") ||
1628               !dStrAsciiCasecmp(ntag, "<!DOCTYPE html >") ||
1629               !dStrAsciiCasecmp(ntag,
1630                            "<!DOCTYPE html SYSTEM \"about:legacy-compat\">") ||
1631               !dStrAsciiCasecmp(ntag,
1632                              "<!DOCTYPE html SYSTEM 'about:legacy-compat'>")) {
1633       html->DocType = DT_HTML;
1634       html->DocTypeVersion = 5.0f;
1635    }
1636    if (html->DocType == DT_NONE) {
1637       html->DocType = DT_UNRECOGNIZED;
1638       BUG_MSG("DOCTYPE not recognized: ('%s').", ntag);
1639    }
1640    dFree(ntag);
1641 }
1642 
1643 /*
1644  * Handle open HTML element
1645  */
Html_tag_open_html(DilloHtml * html,const char * tag,int tagsize)1646 static void Html_tag_open_html(DilloHtml *html, const char *tag, int tagsize)
1647 {
1648    /* The IN_HTML flag will be kept set until at IN_EOF condition.
1649     * This allows to handle pages with multiple or uneven HTML tags */
1650 
1651    if (!(html->InFlags & IN_HTML))
1652       html->InFlags |= IN_HTML;
1653    if (html->Num_HTML < UCHAR_MAX)
1654       ++html->Num_HTML;
1655 
1656    if (html->Num_HTML > 1) {
1657       BUG_MSG("<html> was already open.");
1658       html->ReqTagClose = true;
1659    }
1660 }
1661 
1662 /*
1663  * Handle close HTML element
1664  */
Html_tag_close_html(DilloHtml * html)1665 static void Html_tag_close_html(DilloHtml *html)
1666 {
1667    _MSG("Html_tag_close_html: Num_HTML=%d\n", html->Num_HTML);
1668 }
1669 
1670 /*
1671  * Handle open HEAD element
1672  */
Html_tag_open_head(DilloHtml * html,const char * tag,int tagsize)1673 static void Html_tag_open_head(DilloHtml *html, const char *tag, int tagsize)
1674 {
1675    if (html->InFlags & IN_BODY) {
1676       BUG_MSG("<head> must go before the BODY section.");
1677       html->ReqTagClose = true;
1678       return;
1679    }
1680 
1681    if (html->Num_HEAD < UCHAR_MAX)
1682       ++html->Num_HEAD;
1683    if (html->InFlags & IN_HEAD) {
1684       BUG_MSG("<head> was already open.");
1685       html->ReqTagClose = true;
1686    } else if (html->Num_HEAD > 1) {
1687       BUG_MSG("<head> already finished -- ignoring.");
1688       html->ReqTagClose = true;
1689    } else {
1690       html->InFlags |= IN_HEAD;
1691    }
1692 }
1693 
1694 /*
1695  * Handle close HEAD element
1696  * Note: HEAD is parsed once completely got.
1697  */
Html_tag_close_head(DilloHtml * html)1698 static void Html_tag_close_head(DilloHtml *html)
1699 {
1700    if (html->InFlags & IN_HEAD) {
1701       if (html->Num_HEAD == 1) {
1702          /* match for the well formed start of HEAD section */
1703          if (html->Num_TITLE == 0)
1704             BUG_MSG("<head> lacks <title>.");
1705 
1706          html->InFlags &= ~IN_HEAD;
1707 
1708          /* charset is already set, load remote stylesheets now */
1709          for (int i = 0; i < html->cssUrls->size(); i++) {
1710             a_Html_load_stylesheet(html, html->cssUrls->get(i));
1711          }
1712       } else if (html->Num_HEAD > 1) {
1713          --html->Num_HEAD;
1714       }
1715    } else {
1716       /* not reached, see Html_tag_cleanup_at_close() */
1717    }
1718 }
1719 
1720 /*
1721  * Handle open TITLE
1722  * calls stash init, where the title string will be stored
1723  */
Html_tag_open_title(DilloHtml * html,const char * tag,int tagsize)1724 static void Html_tag_open_title(DilloHtml *html, const char *tag, int tagsize)
1725 {
1726    /* fill the stash buffer so TITLE content can be ignored
1727     * when not valid, redundant or outside HEAD section */
1728    a_Html_stash_init(html);
1729 
1730    if (html->InFlags & IN_HEAD) {
1731       if (html->Num_TITLE < UCHAR_MAX)
1732          ++html->Num_TITLE;
1733       if (html->Num_TITLE > 1)
1734          BUG_MSG("Redundant <title>.");
1735    } else {
1736       BUG_MSG("<title> must be inside <head> -- ignoring.");
1737    }
1738 }
1739 
1740 /*
1741  * Handle close TITLE
1742  * set page-title in the browser window and in the history.
1743  */
Html_tag_close_title(DilloHtml * html)1744 static void Html_tag_close_title(DilloHtml *html)
1745 {
1746    if (html->InFlags & IN_HEAD && html->Num_TITLE == 1) {
1747       /* title is only valid inside HEAD */
1748       a_UIcmd_set_page_title(html->bw, html->Stash->str);
1749       a_History_set_title_by_url(html->page_url, html->Stash->str);
1750    }
1751 }
1752 
1753 /*
1754  * Handle open SCRIPT
1755  * initializes stash, where the embedded code will be stored.
1756  * MODE_VERBATIM is used because MODE_STASH catches entities.
1757  */
Html_tag_open_script(DilloHtml * html,const char * tag,int tagsize)1758 static void Html_tag_open_script(DilloHtml *html, const char *tag, int tagsize)
1759 {
1760    a_Html_stash_init(html);
1761    S_TOP(html)->parse_mode = DILLO_HTML_PARSE_MODE_VERBATIM;
1762 }
1763 
1764 /*
1765  * Handle close SCRIPT
1766  */
Html_tag_close_script(DilloHtml * html)1767 static void Html_tag_close_script(DilloHtml *html)
1768 {
1769    /* eventually the stash will be sent to an interpreter for parsing */
1770 }
1771 
1772 /*
1773  * Handle open STYLE
1774  * Store contents in the stash where the style sheet interpreter can get it.
1775  */
Html_tag_open_style(DilloHtml * html,const char * tag,int tagsize)1776 static void Html_tag_open_style(DilloHtml *html, const char *tag, int tagsize)
1777 {
1778    const char *attrbuf;
1779 
1780    html->loadCssFromStash = true;
1781 
1782    if (!(attrbuf = a_Html_get_attr(html, tag, tagsize, "type"))) {
1783       if (html->DocType != DT_HTML || html->DocTypeVersion <= 4.01f)
1784          BUG_MSG("<style> requires type attribute.");
1785    } else if (dStrAsciiCasecmp(attrbuf, "text/css")) {
1786       html->loadCssFromStash = false;
1787    }
1788    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "media")) &&
1789        dStrAsciiCasecmp(attrbuf, "all") && !dStriAsciiStr(attrbuf, "screen")) {
1790       /* HTML 4.01 sec. 6.13 says that media descriptors are case-sensitive,
1791        * but sec. 14.2.3 says that the attribute is case-insensitive.
1792        * TODO can be a comma-separated list.
1793        * TODO handheld.
1794        */
1795       html->loadCssFromStash = false;
1796    }
1797 
1798    a_Html_stash_init(html);
1799    S_TOP(html)->parse_mode = DILLO_HTML_PARSE_MODE_VERBATIM;
1800 }
1801 
1802 /*
1803  * Handle close STYLE
1804  */
Html_tag_close_style(DilloHtml * html)1805 static void Html_tag_close_style(DilloHtml *html)
1806 {
1807    if (prefs.parse_embedded_css && html->loadCssFromStash)
1808       html->styleEngine->parse(html, html->base_url, html->Stash->str,
1809                                html->Stash->len, CSS_ORIGIN_AUTHOR);
1810 }
1811 
1812 /*
1813  * <BODY>
1814  */
Html_tag_open_body(DilloHtml * html,const char * tag,int tagsize)1815 static void Html_tag_open_body(DilloHtml *html, const char *tag, int tagsize)
1816 {
1817    const char *attrbuf;
1818    int32_t color;
1819    int tag_index_a = a_Html_tag_index ("a");
1820    style::Color *bgColor;
1821    style::StyleImage *bgImage;
1822    style::BackgroundRepeat bgRepeat;
1823    style::BackgroundAttachment bgAttachment;
1824    style::Length bgPositionX, bgPositionY;
1825 
1826    _MSG("Html_tag_open_body Num_BODY=%d\n", html->Num_BODY);
1827    if (!(html->InFlags & IN_BODY))
1828       html->InFlags |= IN_BODY;
1829    if (html->Num_BODY < UCHAR_MAX)
1830       ++html->Num_BODY;
1831 
1832    if (html->Num_BODY > 1) {
1833       BUG_MSG("<body> was already open.");
1834       html->ReqTagClose = true;
1835       return;
1836    }
1837 
1838    if (html->InFlags & IN_HEAD) {
1839       /* if we're here, it's bad XHTML, no need to recover */
1840       BUG_MSG("Unclosed <head>.");
1841    }
1842 
1843    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "bgcolor"))) {
1844       color = a_Html_color_parse(html, attrbuf, -1);
1845 
1846       if (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f)
1847          BUG_MSG("<body> bgcolor attribute is obsolete.");
1848 
1849       if (color != -1)
1850          html->styleEngine->setNonCssHint (CSS_PROPERTY_BACKGROUND_COLOR,
1851                                            CSS_TYPE_COLOR, color);
1852    }
1853 
1854    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "text"))) {
1855       color = a_Html_color_parse(html, attrbuf, -1);
1856 
1857       if (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f)
1858          BUG_MSG("<body> text attribute is obsolete.");
1859 
1860       if (color != -1)
1861          html->styleEngine->setNonCssHint (CSS_PROPERTY_COLOR,
1862                                            CSS_TYPE_COLOR, color);
1863    }
1864 
1865    html->restyle ();
1866 
1867    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "link"))) {
1868       html->non_css_link_color = a_Html_color_parse(html, attrbuf, -1);
1869       if (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f)
1870          BUG_MSG("<body> link attribute is obsolete.");
1871    }
1872 
1873    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "vlink"))) {
1874       html->non_css_visited_color = a_Html_color_parse(html, attrbuf, -1);
1875       if (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f)
1876          BUG_MSG("<body> vlink attribute is obsolete.");
1877    }
1878 
1879    html->dw->setStyle (html->style ());
1880 
1881    bgColor = html->styleEngine->backgroundColor ();
1882    if (bgColor)
1883       HT2LT(html)->setBgColor(bgColor);
1884 
1885    bgImage = html->styleEngine->backgroundImage (&bgRepeat, &bgAttachment,
1886                                                  &bgPositionX, &bgPositionY);
1887    if (bgImage)
1888       HT2LT(html)->setBgImage(bgImage, bgRepeat, bgAttachment, bgPositionX,
1889                               bgPositionY);
1890 
1891    /* Determine a color for visited links.
1892     * This color is computed once per page and used for immediate feedback
1893     * when clicking a link.
1894     * On reload style including color for visited links is computed properly
1895     * according to CSS.
1896     */
1897    html->startElement (tag_index_a);
1898    html->styleEngine->setPseudoVisited ();
1899    if (html->non_css_visited_color != -1) {
1900       html->styleEngine->setNonCssHint (CSS_PROPERTY_COLOR, CSS_TYPE_COLOR,
1901                                         html->non_css_visited_color);
1902    }
1903    html->visited_color = html->style ()->color->getColor ();
1904    html->styleEngine->endElement (tag_index_a);
1905 
1906    if (prefs.contrast_visited_color) {
1907       /* get a color that has a "safe distance" from text, link and bg */
1908       html->visited_color =
1909          a_Color_vc(html->visited_color,
1910             html->style ()->color->getColor(),
1911             html->non_css_link_color,
1912             html->backgroundStyle()->backgroundColor->getColor());
1913    }
1914 
1915 
1916    S_TOP(html)->parse_mode = DILLO_HTML_PARSE_MODE_BODY;
1917 }
1918 
1919 /*
1920  * BODY
1921  */
Html_tag_close_body(DilloHtml * html)1922 static void Html_tag_close_body(DilloHtml *html)
1923 {
1924    /* Some tag soup pages use multiple BODY tags...
1925     * Defer clearing the IN_BODY flag until IN_EOF */
1926 }
1927 
1928 /*
1929  * <P>
1930  * TODO: what's the point between adding the parbreak before and
1931  *       after the push?
1932  */
Html_tag_open_p(DilloHtml * html,const char * tag,int tagsize)1933 static void Html_tag_open_p(DilloHtml *html, const char *tag, int tagsize)
1934 {
1935    CssPropertyList props;
1936 
1937    a_Html_tag_set_align_attr (html, tag, tagsize);
1938 }
1939 
1940 /*
1941  * <FRAME>, <IFRAME>
1942  * TODO: This is just a temporary fix while real frame support
1943  *       isn't finished. Imitates lynx/w3m's frames.
1944  */
Html_tag_open_frame(DilloHtml * html,const char * tag,int tagsize)1945 static void Html_tag_open_frame (DilloHtml *html, const char *tag, int tagsize)
1946 {
1947    const char *attrbuf;
1948    DilloUrl *url;
1949    CssPropertyList props;
1950 
1951    if (!(attrbuf = a_Html_get_attr(html, tag, tagsize, "src")))
1952       return;
1953 
1954    if (!(url = a_Html_url_new(html, attrbuf, NULL, 0)))
1955       return;
1956 
1957    if (a_Capi_get_flags_with_redirection(url) & CAPI_IsCached) {
1958       /* visited frame */
1959       html->styleEngine->setPseudoVisited ();
1960    } else {
1961       /* unvisited frame */
1962       html->styleEngine->setPseudoLink ();
1963    }
1964 
1965    html->styleEngine->setNonCssHint (PROPERTY_X_LINK, CSS_TYPE_INTEGER,
1966                                      Html_set_new_link(html,&url));
1967 }
1968 
1969 static void
Html_tag_content_frame(DilloHtml * html,const char * tag,int tagsize)1970  Html_tag_content_frame (DilloHtml *html, const char *tag, int tagsize)
1971 {
1972    const char *attrbuf;
1973    char *src;
1974    Textblock *textblock;
1975    Widget *bullet;
1976 
1977    textblock = HT2TB(html);
1978 
1979    if (!(attrbuf = a_Html_get_attr(html, tag, tagsize, "src")))
1980       return;
1981 
1982    src = dStrdup(attrbuf);
1983 
1984    textblock->addParbreak (5, html->wordStyle ());
1985 
1986    bullet = new Bullet();
1987    textblock->addWidget(bullet, html->wordStyle ());
1988    textblock->addSpace(html->wordStyle ());
1989 
1990    if (D_ASCII_TOLOWER(tag[1]) == 'i') {
1991       /* IFRAME usually comes with very long advertising/spying URLS,
1992        * to not break rendering we will force name="IFRAME" */
1993       textblock->addText ("IFRAME", html->wordStyle ());
1994 
1995    } else {
1996       /* FRAME:
1997        * If 'name' tag is present use it, if not use 'src' value */
1998       if (!(attrbuf = a_Html_get_attr(html, tag, tagsize, "name"))) {
1999          textblock->addText (src, html->wordStyle ());
2000       } else {
2001          textblock->addText (attrbuf, html->wordStyle ());
2002       }
2003    }
2004 
2005    textblock->addParbreak (5, html->wordStyle ());
2006 
2007    dFree(src);
2008 }
2009 
2010 /*
2011  * <FRAMESET>
2012  * TODO: This is just a temporary fix while real frame support
2013  *       isn't finished. Imitates lynx/w3m's frames.
2014  */
Html_tag_content_frameset(DilloHtml * html,const char * tag,int tagsize)2015 static void Html_tag_content_frameset (DilloHtml *html,
2016                                     const char *tag, int tagsize)
2017 {
2018    HT2TB(html)->addParbreak (9, html->wordStyle ());
2019    HT2TB(html)->addText("--FRAME--", html->wordStyle ());
2020    Html_add_textblock(html, 5);
2021 }
2022 
2023 /*
2024  * <H1> | <H2> | <H3> | <H4> | <H5> | <H6>
2025  */
Html_tag_open_h(DilloHtml * html,const char * tag,int tagsize)2026 static void Html_tag_open_h(DilloHtml *html, const char *tag, int tagsize)
2027 {
2028    a_Html_tag_set_align_attr (html, tag, tagsize);
2029 
2030    a_Html_stash_init(html);
2031    S_TOP(html)->parse_mode =
2032       DILLO_HTML_PARSE_MODE_STASH_AND_BODY;
2033 }
2034 
2035 /*
2036  * <BR>
2037  */
Html_tag_content_br(DilloHtml * html,const char * tag,int tagsize)2038 static void Html_tag_content_br(DilloHtml *html, const char *tag, int tagsize)
2039 {
2040    HT2TB(html)->addLinebreak (html->wordStyle ());
2041 }
2042 
2043 /*
2044  * <FONT>
2045  */
Html_tag_open_font(DilloHtml * html,const char * tag,int tagsize)2046 static void Html_tag_open_font(DilloHtml *html, const char *tag, int tagsize)
2047 {
2048    const char *attrbuf;
2049    char *fontFamily = NULL;
2050    int32_t color;
2051 
2052    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "color"))) {
2053       if (prefs.contrast_visited_color && html->InVisitedLink) {
2054          color = html->visited_color;
2055       } else {
2056          /* use the tag-specified color */
2057          color = a_Html_color_parse(html, attrbuf, -1);
2058       }
2059       if (color != -1)
2060          html->styleEngine->setNonCssHint (CSS_PROPERTY_COLOR,
2061                                            CSS_TYPE_COLOR, color);
2062    }
2063 
2064    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "face"))) {
2065       fontFamily = dStrdup(attrbuf);
2066        html->styleEngine->setNonCssHint (CSS_PROPERTY_FONT_FAMILY,
2067                                          CSS_TYPE_SYMBOL, fontFamily);
2068    }
2069 
2070    dFree(fontFamily);
2071 }
2072 
2073 /*
2074  * <ABBR>
2075  */
Html_tag_open_abbr(DilloHtml * html,const char * tag,int tagsize)2076 static void Html_tag_open_abbr(DilloHtml *html, const char *tag, int tagsize)
2077 {
2078    const char *attrbuf;
2079 
2080    html->styleEngine->inheritBackgroundColor ();
2081 
2082    if (prefs.show_tooltip &&
2083        (attrbuf = a_Html_get_attr(html, tag, tagsize, "title"))) {
2084 
2085       html->styleEngine->setNonCssHint (PROPERTY_X_TOOLTIP, CSS_TYPE_STRING,
2086                                         attrbuf);
2087    }
2088 }
2089 
2090 /*
2091  * Read image-associated tag attributes and create new image.
2092  */
a_Html_common_image_attrs(DilloHtml * html,const char * tag,int tagsize)2093 void a_Html_common_image_attrs(DilloHtml *html, const char *tag, int tagsize)
2094 {
2095    char *width_ptr, *height_ptr;
2096    const char *attrbuf;
2097    CssLength l_w  = CSS_CREATE_LENGTH(0.0, CSS_LENGTH_TYPE_AUTO);
2098    CssLength l_h  = CSS_CREATE_LENGTH(0.0, CSS_LENGTH_TYPE_AUTO);
2099    int w = 0, h = 0;
2100 
2101    if (prefs.show_tooltip &&
2102        (attrbuf = a_Html_get_attr(html, tag, tagsize, "title"))) {
2103       html->styleEngine->setNonCssHint(PROPERTY_X_TOOLTIP, CSS_TYPE_STRING,
2104                                        attrbuf);
2105    }
2106    width_ptr = a_Html_get_attr_wdef(html, tag, tagsize, "width", NULL);
2107    height_ptr = a_Html_get_attr_wdef(html, tag, tagsize, "height", NULL);
2108    // Check for malicious values
2109    // TODO: the same for percentage and relative lengths.
2110    if (width_ptr) {
2111       l_w = a_Html_parse_length (html, width_ptr);
2112       w = (int) (CSS_LENGTH_TYPE(l_w) == CSS_LENGTH_TYPE_PX ?
2113                  CSS_LENGTH_VALUE(l_w) : 0);
2114    }
2115    if (height_ptr) {
2116       l_h = a_Html_parse_length (html, height_ptr);
2117       h = (int) (CSS_LENGTH_TYPE(l_h) == CSS_LENGTH_TYPE_PX ?
2118                  CSS_LENGTH_VALUE(l_h) : 0);
2119    }
2120    /* Check for suspicious image size request that would cause
2121     * an excessive amount of memory to be allocated for the
2122     * image buffer.
2123     * Be careful to avoid integer overflows during the checks.
2124     * There is an additional check in dw/image.cc to catch cases
2125     * where only one dimension is given and the image is scaled
2126     * preserving its original aspect ratio.
2127     * Size requests passed via CSS are also checked there.
2128     */
2129    if (w < 0 || h < 0 ||
2130        w > IMAGE_MAX_AREA || h > IMAGE_MAX_AREA ||
2131        (h > 0 &&  w > IMAGE_MAX_AREA / h)) {
2132       dFree(width_ptr);
2133       dFree(height_ptr);
2134       width_ptr = height_ptr = NULL;
2135       MSG("a_Html_common_image_attrs: suspicious image size request %d x %d\n",
2136           w, h);
2137    } else {
2138       if (CSS_LENGTH_TYPE(l_w) != CSS_LENGTH_TYPE_AUTO)
2139          html->styleEngine->setNonCssHint (CSS_PROPERTY_WIDTH,
2140                                            CSS_TYPE_LENGTH_PERCENTAGE, l_w);
2141       if (CSS_LENGTH_TYPE(l_h) != CSS_LENGTH_TYPE_AUTO)
2142          html->styleEngine->setNonCssHint (CSS_PROPERTY_HEIGHT,
2143                                            CSS_TYPE_LENGTH_PERCENTAGE, l_h);
2144    }
2145 
2146    /* TODO: we should scale the image respecting its ratio.
2147     *       As the image size is not known at this time, maybe a flag
2148     *       can be set to scale it later.
2149    if ((width_ptr && !height_ptr) || (height_ptr && !width_ptr))
2150       [...]
2151    */
2152 
2153    /* x_img is an index to a list of {url,image} pairs.
2154     * We know a_Html_image_new() will use size() as its next index */
2155    html->styleEngine->setNonCssHint (PROPERTY_X_IMG, CSS_TYPE_INTEGER,
2156                                      html->images->size());
2157 
2158 
2159    dFree(width_ptr);
2160    dFree(height_ptr);
2161 }
2162 
a_Html_image_new(DilloHtml * html,const char * tag,int tagsize)2163 DilloImage *a_Html_image_new(DilloHtml *html, const char *tag, int tagsize)
2164 {
2165    bool load_now;
2166    char *alt_ptr;
2167    const char *attrbuf;
2168    DilloUrl *url;
2169    DilloImage *image;
2170 
2171    if (!(attrbuf = a_Html_get_attr(html, tag, tagsize, "src")) ||
2172        !(url = a_Html_url_new(html, attrbuf, NULL, 0)))
2173       return NULL;
2174 
2175    alt_ptr = a_Html_get_attr_wdef(html, tag, tagsize, "alt", NULL);
2176    if ((!alt_ptr || !*alt_ptr) && !prefs.load_images) {
2177       dFree(alt_ptr);
2178       alt_ptr = dStrdup("[IMG]"); // Place holder for img_off mode
2179    }
2180 
2181    dw::Image *dw = new dw::Image(alt_ptr);
2182    image =
2183       a_Image_new(html->dw->getLayout(), (void*)(dw::core::ImgRenderer*)dw, 0);
2184 
2185    if (HT2TB(html)->getBgColor())
2186       image->bg_color = HT2TB(html)->getBgColor()->getColor();
2187 
2188    DilloHtmlImage *hi = dNew(DilloHtmlImage, 1);
2189    hi->url = url;
2190    html->images->increase();
2191    html->images->set(html->images->size() - 1, hi);
2192 
2193    load_now = prefs.load_images ||
2194               !dStrAsciiCasecmp(URL_SCHEME(url), "data") ||
2195               (a_Capi_get_flags_with_redirection(url) & CAPI_IsCached);
2196 
2197    if (load_now && Html_load_image(html->bw, url, html->page_url, image)) {
2198       // hi->image is NULL if dillo tries to load the image immediately
2199       hi->image = NULL;
2200    } else {
2201       // otherwise a reference is kept in html->images
2202       hi->image = image;
2203       a_Image_ref(image);
2204    }
2205 
2206    dFree(alt_ptr);
2207    return image;
2208 }
2209 
2210 /*
2211  * Tell cache to retrieve image
2212  */
Html_load_image(BrowserWindow * bw,DilloUrl * url,const DilloUrl * requester,DilloImage * Image)2213 static bool Html_load_image(BrowserWindow *bw, DilloUrl *url,
2214                             const DilloUrl *requester, DilloImage *Image)
2215 {
2216    DilloWeb *Web;
2217    int ClientKey;
2218    /* Fill a Web structure for the cache query */
2219    Web = a_Web_new(bw, url, requester);
2220    Web->Image = Image;
2221    a_Image_ref(Image);
2222    Web->flags |= WEB_Image;
2223    /* Request image data from the cache */
2224    if ((ClientKey = a_Capi_open_url(Web, NULL, NULL)) != 0) {
2225       a_Bw_add_client(bw, ClientKey, 0);
2226       a_Bw_add_url(bw, url);
2227    }
2228    return ClientKey != 0;
2229 }
2230 
Html_tag_open_img(DilloHtml * html,const char * tag,int tagsize)2231 static void Html_tag_open_img(DilloHtml *html, const char *tag, int tagsize)
2232 {
2233    int space, border;
2234    const char *attrbuf;
2235 
2236    a_Html_common_image_attrs(html, tag, tagsize);
2237 
2238    /* Spacing to the left and right */
2239    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "hspace"))) {
2240       space = strtol(attrbuf, NULL, 10);
2241       if (space > 0) {
2242          space = CSS_CREATE_LENGTH(space, CSS_LENGTH_TYPE_PX);
2243          html->styleEngine->setNonCssHint (CSS_PROPERTY_MARGIN_LEFT,
2244                                            CSS_TYPE_LENGTH_PERCENTAGE, space);
2245          html->styleEngine->setNonCssHint (CSS_PROPERTY_MARGIN_RIGHT,
2246                                            CSS_TYPE_LENGTH_PERCENTAGE, space);
2247       }
2248    }
2249 
2250    /* Spacing at the top and bottom */
2251    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "vspace"))) {
2252       space = strtol(attrbuf, NULL, 10);
2253       if (space > 0) {
2254          space = CSS_CREATE_LENGTH(space, CSS_LENGTH_TYPE_PX);
2255          html->styleEngine->setNonCssHint (CSS_PROPERTY_MARGIN_TOP,
2256                                            CSS_TYPE_LENGTH_PERCENTAGE, space);
2257          html->styleEngine->setNonCssHint (CSS_PROPERTY_MARGIN_BOTTOM,
2258                                            CSS_TYPE_LENGTH_PERCENTAGE, space);
2259       }
2260    }
2261 
2262    /* Border */
2263    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "border"))) {
2264       border = strtol(attrbuf, NULL, 10);
2265       if (border >= 0) {
2266          border = CSS_CREATE_LENGTH(border, CSS_LENGTH_TYPE_PX);
2267          html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_TOP_WIDTH,
2268                                            CSS_TYPE_LENGTH_PERCENTAGE, border);
2269          html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_BOTTOM_WIDTH,
2270                                            CSS_TYPE_LENGTH_PERCENTAGE, border);
2271          html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_LEFT_WIDTH,
2272                                            CSS_TYPE_LENGTH_PERCENTAGE, border);
2273          html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_RIGHT_WIDTH,
2274                                            CSS_TYPE_LENGTH_PERCENTAGE, border);
2275 
2276          html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_TOP_STYLE,
2277                                            CSS_TYPE_ENUM, BORDER_SOLID);
2278          html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_BOTTOM_STYLE,
2279                                            CSS_TYPE_ENUM, BORDER_SOLID);
2280          html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_LEFT_STYLE,
2281                                            CSS_TYPE_ENUM, BORDER_SOLID);
2282          html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_RIGHT_STYLE,
2283                                            CSS_TYPE_ENUM, BORDER_SOLID);
2284       }
2285    }
2286 
2287 }
2288 
2289 /*
2290  * Create a new Image struct and request the image-url to the cache
2291  * (If it either hits or misses, is not relevant here; that's up to the
2292  *  cache functions)
2293  */
Html_tag_content_img(DilloHtml * html,const char * tag,int tagsize)2294 static void Html_tag_content_img(DilloHtml *html, const char *tag, int tagsize)
2295 {
2296    DilloImage *Image;
2297    DilloUrl *usemap_url;
2298    const char *attrbuf;
2299 
2300    /* This avoids loading images. Useful for viewing suspicious HTML email. */
2301    if (URL_FLAGS(html->base_url) & URL_SpamSafe)
2302       return;
2303 
2304    Image = a_Html_image_new(html, tag, tagsize);
2305    if (!Image)
2306       return;
2307 
2308    usemap_url = NULL;
2309    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "usemap")))
2310       /* TODO: usemap URLs outside of the document are not used. */
2311       usemap_url = a_Html_url_new(html, attrbuf, NULL, 0);
2312 
2313    // At this point, we know that Image->ir represents an image
2314    // widget. Notice that the order of the casts matters, because of
2315    // multiple inheritance.
2316    dw::Image *dwi = (dw::Image*)(dw::core::ImgRenderer*)Image->img_rndr;
2317    HT2TB(html)->addWidget(dwi, html->style());
2318 
2319    /* Image maps */
2320    if (a_Html_get_attr(html, tag, tagsize, "ismap")) {
2321       dwi->setIsMap();
2322       _MSG("  Html_tag_open_img: server-side map (ISMAP)\n");
2323    } else if (html->style ()->x_link != -1 &&
2324               usemap_url == NULL) {
2325       /* For simple links, we have to suppress the "image_pressed" signal.
2326        * This is overridden for USEMAP images. */
2327 //    a_Dw_widget_set_button_sensitive (IM2DW(Image->dw), FALSE);
2328    }
2329 
2330    if (usemap_url) {
2331       dwi->setUseMap(&html->maps, new ::object::String(URL_STR(usemap_url)));
2332       a_Url_free (usemap_url);
2333    }
2334 }
2335 
2336 /*
2337  * <map>
2338  */
Html_tag_content_map(DilloHtml * html,const char * tag,int tagsize)2339 static void Html_tag_content_map(DilloHtml *html, const char *tag, int tagsize)
2340 {
2341    char *hash_name;
2342    const char *attrbuf;
2343    DilloUrl *url;
2344 
2345    if (html->InFlags & IN_MAP) {
2346       BUG_MSG("Nested <map>.");
2347    } else {
2348       if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "name"))) {
2349          html->InFlags |= IN_MAP;
2350          hash_name = dStrconcat("#", attrbuf, NULL);
2351          url = a_Html_url_new(html, hash_name, NULL, 0);
2352          html->maps.startNewMap(new ::object::String(URL_STR(url)));
2353          a_Url_free (url);
2354          dFree(hash_name);
2355       } else {
2356          BUG_MSG("<map> requires name attribute.");
2357       }
2358    }
2359 }
2360 
2361 /*
2362  * Handle close <MAP>
2363  */
Html_tag_close_map(DilloHtml * html)2364 static void Html_tag_close_map(DilloHtml *html)
2365 {
2366    /* This is a hack for the perhaps frivolous feature of drawing image map
2367     * shapes when there is no image to display. If this map is defined after
2368     * an image that has not been loaded (img != NULL), tell the image to
2369     * redraw. (It will only do so if it uses a map.)
2370     */
2371    for (int i = 0; i < html->images->size(); i++) {
2372       DilloImage *img = html->images->get(i)->image;
2373 
2374       if (img) {
2375          // At this point, we know that img->ir represents an image
2376          // widget. (Really? Is this assumtion safe?) Notice that the
2377          // order of the casts matters, because of multiple
2378          // inheritance.
2379          dw::Image *dwi = (dw::Image*)(dw::core::ImgRenderer*)img->img_rndr;
2380          dwi->forceMapRedraw();
2381       }
2382    }
2383    html->InFlags &= ~IN_MAP;
2384 }
2385 
2386 /*
2387  * Read coords in a string, returning a vector of ints.
2388  */
2389 static
Html_read_coords(DilloHtml * html,const char * str)2390 misc::SimpleVector<int> *Html_read_coords(DilloHtml *html, const char *str)
2391 {
2392    int coord;
2393    const char *tail = str;
2394    char *newtail = NULL;
2395    misc::SimpleVector<int> *coords = new misc::SimpleVector<int> (4);
2396 
2397    while (1) {
2398       coord = strtol(tail, &newtail, 10);
2399       if (coord == 0 && newtail == tail)
2400          break;
2401       coords->increase();
2402       coords->set(coords->size() - 1, coord);
2403       while (isspace(*newtail))
2404          newtail++;
2405       if (!*newtail)
2406          break;
2407       if (*newtail != ',') {
2408          BUG_MSG("<area> coords must be integers separated by commas.");
2409       }
2410       tail = newtail + 1;
2411    }
2412 
2413    return coords;
2414 }
2415 
2416 /*
2417  * <AREA>
2418  */
2419 static void
Html_tag_content_area(DilloHtml * html,const char * tag,int tagsize)2420  Html_tag_content_area(DilloHtml *html, const char *tag, int tagsize)
2421 {
2422    enum types {UNKNOWN, RECTANGLE, CIRCLE, POLYGON, BACKGROUND};
2423    types type;
2424    misc::SimpleVector<int> *coords = NULL;
2425    DilloUrl* url;
2426    const char *attrbuf;
2427    int link = -1;
2428    Shape *shape = NULL;
2429 
2430    if (!(html->InFlags & IN_MAP)) {
2431       BUG_MSG("<area> not inside <map>.");
2432       return;
2433    }
2434    attrbuf = a_Html_get_attr(html, tag, tagsize, "shape");
2435 
2436    if (!attrbuf || !*attrbuf || !dStrAsciiCasecmp(attrbuf, "rect")) {
2437       /* the default shape is a rectangle */
2438       type = RECTANGLE;
2439    } else if (dStrAsciiCasecmp(attrbuf, "default") == 0) {
2440       /* "default" is the background */
2441       type = BACKGROUND;
2442    } else if (dStrAsciiCasecmp(attrbuf, "circle") == 0) {
2443       type = CIRCLE;
2444    } else if (dStrnAsciiCasecmp(attrbuf, "poly", 4) == 0) {
2445       type = POLYGON;
2446    } else {
2447       BUG_MSG("<area> unknown shape: '%s'.", attrbuf);
2448       type = UNKNOWN;
2449    }
2450    if (type == RECTANGLE || type == CIRCLE || type == POLYGON) {
2451       /* TODO: add support for coords in % */
2452       if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "coords"))) {
2453          coords = Html_read_coords(html, attrbuf);
2454 
2455          if (type == RECTANGLE) {
2456             if (coords->size() != 4)
2457                BUG_MSG("<area> rectangle must have four coordinate values.");
2458             if (coords->size() >= 4)
2459                shape = new Rectangle(coords->get(0),
2460                                      coords->get(1),
2461                                      coords->get(2) - coords->get(0),
2462                                      coords->get(3) - coords->get(1));
2463          } else if (type == CIRCLE) {
2464             if (coords->size() != 3)
2465                BUG_MSG("<area> circle must have three coordinate values.");
2466             if (coords->size() >= 3)
2467                shape = new Circle(coords->get(0), coords->get(1),
2468                                   coords->get(2));
2469          } else if (type == POLYGON) {
2470             Polygon *poly;
2471             int i;
2472             if (coords->size() % 2)
2473                BUG_MSG("<area> polygon with odd number of coordinates.");
2474             shape = poly = new Polygon();
2475             for (i = 0; i < (coords->size() / 2); i++)
2476                poly->addPoint(coords->get(2*i), coords->get(2*i + 1));
2477          }
2478          delete(coords);
2479       }
2480    }
2481    if (shape != NULL || type == BACKGROUND) {
2482       if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "href"))) {
2483          url = a_Html_url_new(html, attrbuf, NULL, 0);
2484          dReturn_if_fail ( url != NULL );
2485          if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "alt")))
2486             a_Url_set_alt(url, attrbuf);
2487 
2488          link = Html_set_new_link(html, &url);
2489       }
2490       if (type == BACKGROUND)
2491          html->maps.setCurrentMapDefaultLink(link);
2492       else
2493          html->maps.addShapeToCurrentMap(shape, link);
2494    }
2495 }
2496 
2497 /*
2498  * <OBJECT>
2499  * Simply provide a link if the object is something downloadable.
2500  */
Html_tag_open_object(DilloHtml * html,const char * tag,int tagsize)2501 static void Html_tag_open_object(DilloHtml *html, const char *tag, int tagsize)
2502 {
2503    DilloUrl *url, *base_url = NULL;
2504    const char *attrbuf;
2505 
2506    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "codebase"))) {
2507       base_url = a_Html_url_new(html, attrbuf, NULL, 0);
2508    }
2509 
2510    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "data"))) {
2511       url = a_Html_url_new(html, attrbuf,
2512                            URL_STR(base_url), (base_url != NULL));
2513       dReturn_if_fail ( url != NULL );
2514 
2515       if (a_Capi_get_flags_with_redirection(url) & CAPI_IsCached) {
2516          html->styleEngine->setPseudoVisited ();
2517       } else {
2518          html->styleEngine->setPseudoLink ();
2519       }
2520 
2521       html->styleEngine->setNonCssHint(PROPERTY_X_LINK, CSS_TYPE_INTEGER,
2522                                        Html_set_new_link(html, &url));
2523    }
2524    a_Url_free(base_url);
2525 }
2526 
Html_tag_content_object(DilloHtml * html,const char * tag,int tagsize)2527 static void Html_tag_content_object(DilloHtml *html, const char *tag,
2528                                     int tagsize)
2529 {
2530    if (a_Html_get_attr(html, tag, tagsize, "data"))
2531       HT2TB(html)->addText("[OBJECT]", html->wordStyle ());
2532 }
2533 
2534 /*
2535  * <VIDEO>
2536  * Provide a link to the video.
2537  */
Html_tag_open_video(DilloHtml * html,const char * tag,int tagsize)2538 static void Html_tag_open_video(DilloHtml *html, const char *tag, int tagsize)
2539 {
2540    DilloUrl *url;
2541    const char *attrbuf;
2542 
2543    if (html->InFlags & IN_MEDIA) {
2544       MSG("<video> not handled when already inside a media element.\n");
2545       return;
2546    }
2547    /* TODO: poster attr */
2548 
2549    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "src"))) {
2550       url = a_Html_url_new(html, attrbuf, NULL, 0);
2551       dReturn_if_fail ( url != NULL );
2552 
2553       if (a_Capi_get_flags_with_redirection(url) & CAPI_IsCached) {
2554          html->styleEngine->setPseudoVisited ();
2555       } else {
2556          html->styleEngine->setPseudoLink ();
2557       }
2558 
2559       html->styleEngine->setNonCssHint(PROPERTY_X_LINK, CSS_TYPE_INTEGER,
2560                                        Html_set_new_link(html, &url));
2561 
2562       HT2TB(html)->addText("[VIDEO]", html->wordStyle ());
2563    }
2564    html->InFlags |= IN_MEDIA;
2565 }
2566 
2567 /*
2568  * <AUDIO>
2569  * Provide a link to the audio.
2570  */
Html_tag_open_audio(DilloHtml * html,const char * tag,int tagsize)2571 static void Html_tag_open_audio(DilloHtml *html, const char *tag, int tagsize)
2572 {
2573    DilloUrl *url;
2574    const char *attrbuf;
2575 
2576    if (html->InFlags & IN_MEDIA) {
2577       MSG("<audio> not handled when already inside a media element.\n");
2578       return;
2579    }
2580 
2581    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "src"))) {
2582       url = a_Html_url_new(html, attrbuf, NULL, 0);
2583       dReturn_if_fail ( url != NULL );
2584 
2585       if (a_Capi_get_flags_with_redirection(url) & CAPI_IsCached) {
2586          html->styleEngine->setPseudoVisited ();
2587       } else {
2588          html->styleEngine->setPseudoLink ();
2589       }
2590 
2591       html->styleEngine->setNonCssHint(PROPERTY_X_LINK, CSS_TYPE_INTEGER,
2592                                        Html_set_new_link(html, &url));
2593 
2594       HT2TB(html)->addText("[AUDIO]", html->wordStyle ());
2595    }
2596    html->InFlags |= IN_MEDIA;
2597 }
2598 
2599 /*
2600  * <SOURCE>
2601  * Media resource; provide a link to its address.
2602  */
Html_tag_open_source(DilloHtml * html,const char * tag,int tagsize)2603 static void Html_tag_open_source(DilloHtml *html, const char *tag,
2604                                     int tagsize)
2605 {
2606    const char *attrbuf;
2607 
2608    if (!(html->InFlags & IN_MEDIA)) {
2609       BUG_MSG("<source> not inside a media element.");
2610       return;
2611    }
2612    if (!(attrbuf = a_Html_get_attr(html, tag, tagsize, "src"))) {
2613       BUG_MSG("<source> requires src attribute.");
2614       return;
2615    } else {
2616       DilloUrl *url = a_Html_url_new(html, attrbuf, NULL, 0);
2617 
2618       dReturn_if_fail ( url != NULL );
2619 
2620       if (a_Capi_get_flags_with_redirection(url) & CAPI_IsCached) {
2621          html->styleEngine->setPseudoVisited ();
2622       } else {
2623          html->styleEngine->setPseudoLink ();
2624       }
2625       html->styleEngine->setNonCssHint(PROPERTY_X_LINK, CSS_TYPE_INTEGER,
2626                                        Html_set_new_link(html, &url));
2627    }
2628 }
2629 
Html_tag_content_source(DilloHtml * html,const char * tag,int tagsize)2630 static void Html_tag_content_source(DilloHtml *html, const char *tag,
2631                                     int tagsize)
2632 {
2633    if ((html->InFlags & IN_MEDIA) && a_Html_get_attr(html, tag, tagsize,"src"))
2634       HT2TB(html)->addText("[MEDIA SOURCE]", html->wordStyle ());
2635 }
2636 
2637 /*
2638  * Media (AUDIO/VIDEO) close function
2639  */
Html_tag_close_media(DilloHtml * html)2640 static void Html_tag_close_media(DilloHtml *html)
2641 {
2642    html->InFlags &= ~IN_MEDIA;
2643 }
2644 
2645 /*
2646  * <EMBED>
2647  * Provide a link to embedded content.
2648  */
Html_tag_open_embed(DilloHtml * html,const char * tag,int tagsize)2649 static void Html_tag_open_embed(DilloHtml *html, const char *tag, int tagsize)
2650 {
2651    const char *attrbuf;
2652 
2653    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "src"))) {
2654       DilloUrl *url = a_Html_url_new(html, attrbuf, NULL, 0);
2655 
2656       dReturn_if_fail ( url != NULL );
2657 
2658       if (a_Capi_get_flags_with_redirection(url) & CAPI_IsCached) {
2659          html->styleEngine->setPseudoVisited ();
2660       } else {
2661          html->styleEngine->setPseudoLink ();
2662       }
2663 
2664       html->styleEngine->setNonCssHint(PROPERTY_X_LINK, CSS_TYPE_INTEGER,
2665                                        Html_set_new_link(html, &url));
2666    }
2667 }
2668 
Html_tag_content_embed(DilloHtml * html,const char * tag,int tagsize)2669 static void Html_tag_content_embed(DilloHtml *html,const char *tag,int tagsize)
2670 {
2671    if (a_Html_get_attr(html, tag, tagsize, "src"))
2672       HT2TB(html)->addText("[EMBED]", html->wordStyle ());
2673 }
2674 
2675 /*
2676  * Test and extract the link from a javascript instruction.
2677  */
Html_get_javascript_link(DilloHtml * html)2678 static const char* Html_get_javascript_link(DilloHtml *html)
2679 {
2680    size_t i;
2681    char ch, *p1, *p2;
2682    Dstr *Buf = html->attr_data;
2683 
2684    if (dStrnAsciiCasecmp("javascript", Buf->str, 10) == 0) {
2685       i = strcspn(Buf->str, "'\"");
2686       ch = Buf->str[i];
2687       if ((ch == '"' || ch == '\'') &&
2688           (p2 = strchr(Buf->str + i + 1 , ch))) {
2689          p1 = Buf->str + i;
2690          BUG_MSG("Link depends on javascript().");
2691          dStr_truncate(Buf, p2 - Buf->str);
2692          dStr_erase(Buf, 0, p1 - Buf->str + 1);
2693       }
2694    }
2695    return Buf->str;
2696 }
2697 
2698 /*
2699  * Register an anchor for this page.
2700  */
Html_add_anchor(DilloHtml * html,const char * name)2701 static void Html_add_anchor(DilloHtml *html, const char *name)
2702 {
2703    _MSG("Registering ANCHOR: %s\n", name);
2704    if (!HT2TB(html)->addAnchor (name, html->style ()))
2705       BUG_MSG("Anchor names must be unique within the document (\"%s\").",
2706               name);
2707    /*
2708     * According to Sec. 12.2.1 of the HTML 4.01 spec, "anchor names that
2709     * differ only in case may not appear in the same document", but
2710     * "comparisons between fragment identifiers and anchor names must be
2711     * done by exact (case-sensitive) match." We ignore the case issue and
2712     * always test for exact matches. Moreover, what does uppercase mean
2713     * for Unicode characters outside the ASCII range?
2714     */
2715 }
2716 
2717 /*
2718  * <A>
2719  */
Html_tag_open_a(DilloHtml * html,const char * tag,int tagsize)2720 static void Html_tag_open_a(DilloHtml *html, const char *tag, int tagsize)
2721 {
2722    DilloUrl *url;
2723    const char *attrbuf;
2724 
2725    /* TODO: add support for MAP with A HREF */
2726    if (html->InFlags & IN_MAP)
2727       Html_tag_content_area(html, tag, tagsize);
2728 
2729    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "href"))) {
2730       /* if it's a javascript link, extract the reference. */
2731       if (D_ASCII_TOLOWER(attrbuf[0]) == 'j')
2732          attrbuf = Html_get_javascript_link(html);
2733 
2734       url = a_Html_url_new(html, attrbuf, NULL, 0);
2735       dReturn_if_fail ( url != NULL );
2736 
2737       if (a_Capi_get_flags_with_redirection(url) & CAPI_IsCached) {
2738          html->InVisitedLink = true;
2739          html->styleEngine->setPseudoVisited ();
2740          if (html->non_css_visited_color != -1)
2741             html->styleEngine->setNonCssHint(CSS_PROPERTY_COLOR,
2742                                              CSS_TYPE_COLOR,
2743                                              html->non_css_visited_color);
2744       } else {
2745          html->styleEngine->setPseudoLink ();
2746          if (html->non_css_link_color != -1)
2747             html->styleEngine->setNonCssHint(CSS_PROPERTY_COLOR,
2748                                              CSS_TYPE_COLOR,
2749                                              html->non_css_link_color);
2750       }
2751 
2752       html->styleEngine->setNonCssHint (PROPERTY_X_LINK, CSS_TYPE_INTEGER,
2753                                         Html_set_new_link(html, &url));
2754    }
2755    if (prefs.show_tooltip &&
2756        (attrbuf = a_Html_get_attr(html, tag, tagsize, "title"))) {
2757       html->styleEngine->setNonCssHint (PROPERTY_X_TOOLTIP, CSS_TYPE_STRING,
2758                                         attrbuf);
2759    }
2760 
2761    html->styleEngine->inheritBackgroundColor ();
2762 
2763    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "name"))) {
2764       char *nameVal;
2765       const char *id = html->styleEngine->getId ();
2766 
2767       if (prefs.show_extra_warnings)
2768          Html_check_name_val(html, attrbuf, "name");
2769 
2770       nameVal = a_Url_decode_hex_str(attrbuf);
2771 
2772       if (nameVal) {
2773          /* We compare the "id" value with the url-decoded "name" value */
2774          if (!id || strcmp(nameVal, id)) {
2775             if (id)
2776                BUG_MSG("In <a>, id ('%s') and name ('%s') attributes differ.",
2777                         id, nameVal);
2778             Html_add_anchor(html, nameVal);
2779          }
2780 
2781          dFree(nameVal);
2782       }
2783    }
2784 }
2785 
2786 /*
2787  * <A> close function
2788  */
Html_tag_close_a(DilloHtml * html)2789 static void Html_tag_close_a(DilloHtml *html)
2790 {
2791    html->InVisitedLink = false;
2792 }
2793 
2794 /*
2795  * <BLOCKQUOTE>
2796  */
Html_tag_open_blockquote(DilloHtml * html,const char * tag,int tagsize)2797 static void Html_tag_open_blockquote(DilloHtml *html,
2798                                      const char *tag, int tagsize)
2799 {
2800    Html_add_textblock(html, 9);
2801 }
2802 
2803 /*
2804  * <Q>
2805  */
Html_tag_open_q(DilloHtml * html,const char * tag,int tagsize)2806 static void Html_tag_open_q(DilloHtml *html, const char *tag, int tagsize)
2807 {
2808    /*
2809     * Left Double Quotation Mark, which is wrong in many cases, but
2810     * should at least be widely recognized.
2811     */
2812    const char *U201C = "\xe2\x80\x9c";
2813 
2814    html->styleEngine->inheritBackgroundColor ();
2815    HT2TB(html)->addText (U201C, html->wordStyle ());
2816 }
2817 
2818 /*
2819  * </Q>
2820  */
Html_tag_close_q(DilloHtml * html)2821 static void Html_tag_close_q(DilloHtml *html)
2822 {
2823    /* Right Double Quotation Mark */
2824    const char *U201D = "\xe2\x80\x9d";
2825 
2826    HT2TB(html)->addText (U201D, html->wordStyle ());
2827 }
2828 
2829 /*
2830  * Handle the <UL> tag.
2831  */
Html_tag_open_ul(DilloHtml * html,const char * tag,int tagsize)2832 static void Html_tag_open_ul(DilloHtml *html, const char *tag, int tagsize)
2833 {
2834    const char *attrbuf;
2835    ListStyleType list_style_type;
2836 
2837    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "type"))) {
2838 
2839       /* list_style_type explicitly defined */
2840       if (dStrAsciiCasecmp(attrbuf, "disc") == 0)
2841          list_style_type = LIST_STYLE_TYPE_DISC;
2842       else if (dStrAsciiCasecmp(attrbuf, "circle") == 0)
2843          list_style_type = LIST_STYLE_TYPE_CIRCLE;
2844       else if (dStrAsciiCasecmp(attrbuf, "square") == 0)
2845          list_style_type = LIST_STYLE_TYPE_SQUARE;
2846       else
2847          /* invalid value */
2848          list_style_type = LIST_STYLE_TYPE_DISC;
2849 
2850       html->styleEngine->setNonCssHint (CSS_PROPERTY_LIST_STYLE_TYPE,
2851                                         CSS_TYPE_ENUM, list_style_type);
2852       if (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f)
2853          BUG_MSG("<ul> type attribute is obsolete.");
2854    }
2855 
2856    S_TOP(html)->list_type = HTML_LIST_UNORDERED;
2857    S_TOP(html)->list_number = 0;
2858    S_TOP(html)->ref_list_item = NULL;
2859 }
2860 
2861 /*
2862  * Handle the <DIR> or <MENU> tag.
2863  * (Deprecated and almost the same as <UL>)
2864  */
Html_tag_open_dir(DilloHtml * html,const char * tag,int tagsize)2865 static void Html_tag_open_dir(DilloHtml *html, const char *tag, int tagsize)
2866 {
2867    html->styleEngine->inheritBackgroundColor ();
2868    HT2TB(html)->addParbreak (9, html->wordStyle ());
2869 
2870    S_TOP(html)->list_type = HTML_LIST_UNORDERED;
2871    S_TOP(html)->list_number = 0;
2872    S_TOP(html)->ref_list_item = NULL;
2873 
2874    if (prefs.show_extra_warnings)
2875       BUG_MSG("Obsolete list type; use <ul> instead.");
2876 }
2877 
2878 /*
2879  * Handle the <MENU> tag.
2880  */
Html_tag_open_menu(DilloHtml * html,const char * tag,int tagsize)2881 static void Html_tag_open_menu(DilloHtml *html, const char *tag, int tagsize)
2882 {
2883    /* In another bit of ridiculous mess from the HTML5 world, the menu
2884     * element, which was deprecated in HTML4:
2885     * - does not appear at all in W3C's HTML5 spec
2886     * - appears in WHATWG's HTML5 doc and the W3C's 5.1 draft, where it
2887     *   means something totally different than it did in the old days
2888     *   (now it's for popup menus and toolbar menus rather than being a
2889     *   sort of list).
2890     */
2891    if (!(html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f))
2892       Html_tag_open_dir(html, tag, tagsize);
2893 }
2894 
2895 /*
2896  * Handle the <OL> tag.
2897  */
Html_tag_open_ol(DilloHtml * html,const char * tag,int tagsize)2898 static void Html_tag_open_ol(DilloHtml *html, const char *tag, int tagsize)
2899 {
2900    const char *attrbuf;
2901    int n = 1;
2902 
2903    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "type"))) {
2904       ListStyleType listStyleType = LIST_STYLE_TYPE_DECIMAL;
2905 
2906       if (*attrbuf == '1')
2907          listStyleType = LIST_STYLE_TYPE_DECIMAL;
2908       else if (*attrbuf == 'a')
2909          listStyleType = LIST_STYLE_TYPE_LOWER_ALPHA;
2910       else if (*attrbuf == 'A')
2911          listStyleType = LIST_STYLE_TYPE_UPPER_ALPHA;
2912       else if (*attrbuf == 'i')
2913          listStyleType = LIST_STYLE_TYPE_LOWER_ROMAN;
2914       else if (*attrbuf == 'I')
2915          listStyleType = LIST_STYLE_TYPE_UPPER_ROMAN;
2916 
2917       html->styleEngine->setNonCssHint (CSS_PROPERTY_LIST_STYLE_TYPE,
2918                                         CSS_TYPE_ENUM, listStyleType);
2919    }
2920 
2921    S_TOP(html)->list_type = HTML_LIST_ORDERED;
2922 
2923    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "start")) &&
2924        (n = (int) strtol(attrbuf, NULL, 10)) < 0) {
2925       BUG_MSG("Illegal '-' character in START attribute; Starting from 0.");
2926       n = 0;
2927    }
2928    S_TOP(html)->list_number = n;
2929    S_TOP(html)->ref_list_item = NULL;
2930 }
2931 
2932 /*
2933  * Handle the <LI> tag.
2934  */
Html_tag_open_li(DilloHtml * html,const char * tag,int tagsize)2935 static void Html_tag_open_li(DilloHtml *html, const char *tag, int tagsize)
2936 {
2937    Style *style = html->style ();
2938    int *list_number;
2939    const char *attrbuf;
2940 
2941    if (S_TOP(html)->list_type == HTML_LIST_NONE)
2942       BUG_MSG("<li> outside <ul> or <ol>.");
2943 
2944    html->InFlags |= IN_LI;
2945 
2946    /* Get our parent tag's variables (used as state storage) */
2947    list_number = &html->stack->getRef(html->stack->size()-2)->list_number;
2948 
2949    if (style->listStyleType >= LIST_STYLE_TYPE_DECIMAL) {
2950       // ordered
2951       if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "value")) &&
2952           (*list_number = strtol(attrbuf, NULL, 10)) < 0) {
2953          BUG_MSG("Illegal negative list value attribute; Starting from 0.");
2954          *list_number = 0;
2955       }
2956    }
2957 }
2958 
2959 /*
2960  * Close <LI>
2961  */
Html_tag_close_li(DilloHtml * html)2962 static void Html_tag_close_li(DilloHtml *html)
2963 {
2964    html->InFlags &= ~IN_LI;
2965    ((ListItem *)html->dw)->flush ();
2966 }
2967 
2968 /*
2969  * <HR>
2970  */
Html_tag_open_hr(DilloHtml * html,const char * tag,int tagsize)2971 static void Html_tag_open_hr(DilloHtml *html, const char *tag, int tagsize)
2972 {
2973    char *width_ptr;
2974    const char *attrbuf;
2975    int32_t size = 0;
2976 
2977    width_ptr = a_Html_get_attr_wdef(html, tag, tagsize, "width", NULL);
2978    if (width_ptr) {
2979       if (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f)
2980          BUG_MSG("<hr> width attribute is obsolete.");
2981       html->styleEngine->setNonCssHint (CSS_PROPERTY_WIDTH,
2982                                         CSS_TYPE_LENGTH_PERCENTAGE,
2983                                         a_Html_parse_length (html, width_ptr));
2984       dFree(width_ptr);
2985    }
2986 
2987    if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "size"))) {
2988       size = strtol(attrbuf, NULL, 10);
2989       if (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f)
2990          BUG_MSG("<hr> size attribute is obsolete.");
2991    }
2992 
2993    a_Html_tag_set_align_attr(html, tag, tagsize);
2994 
2995    /* TODO: evaluate attribute */
2996    if (a_Html_get_attr(html, tag, tagsize, "noshade")) {
2997       if (html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f)
2998          BUG_MSG("<hr> noshade attribute is obsolete.");
2999       html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_TOP_STYLE,
3000                                         CSS_TYPE_ENUM, BORDER_SOLID);
3001       html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_BOTTOM_STYLE,
3002                                         CSS_TYPE_ENUM, BORDER_SOLID);
3003       html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_LEFT_STYLE,
3004                                         CSS_TYPE_ENUM, BORDER_SOLID);
3005       html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_RIGHT_STYLE,
3006                                         CSS_TYPE_ENUM, BORDER_SOLID);
3007 
3008       if (size <= 0)
3009          size = 1;
3010    }
3011 
3012    if (size > 0) {
3013       CssLength size_top = CSS_CREATE_LENGTH ((size+1)/2, CSS_LENGTH_TYPE_PX);
3014       CssLength size_bottom = CSS_CREATE_LENGTH (size / 2, CSS_LENGTH_TYPE_PX);
3015       html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_TOP_WIDTH,
3016                                         CSS_TYPE_LENGTH_PERCENTAGE, size_top);
3017       html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_LEFT_WIDTH,
3018                                         CSS_TYPE_LENGTH_PERCENTAGE, size_top);
3019       html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_BOTTOM_WIDTH,
3020                                         CSS_TYPE_LENGTH_PERCENTAGE,
3021                                         size_bottom);
3022       html->styleEngine->setNonCssHint (CSS_PROPERTY_BORDER_RIGHT_WIDTH,
3023                                         CSS_TYPE_LENGTH_PERCENTAGE,
3024                                         size_bottom);
3025    }
3026 
3027 }
3028 
Html_tag_content_hr(DilloHtml * html,const char * tag,int tagsize)3029 static void Html_tag_content_hr(DilloHtml *html, const char *tag, int tagsize)
3030 {
3031    Widget *hruler;
3032    HT2TB(html)->addParbreak (5, html->wordStyle ());
3033 
3034    hruler = new Ruler();
3035    hruler->setStyle (html->style ());
3036    HT2TB(html)->addWidget (hruler, html->style ());
3037    HT2TB(html)->addParbreak (5, html->wordStyle ());
3038 }
3039 
3040 /*
3041  * <DL>
3042  */
Html_tag_open_dl(DilloHtml * html,const char * tag,int tagsize)3043 static void Html_tag_open_dl(DilloHtml *html, const char *tag, int tagsize)
3044 {
3045    /* may want to actually do some stuff here. */
3046    html->styleEngine->inheritBackgroundColor ();
3047    HT2TB(html)->addParbreak (9, html->wordStyle ());
3048 }
3049 
3050 /*
3051  * <DT>
3052  */
Html_tag_open_dt(DilloHtml * html,const char * tag,int tagsize)3053 static void Html_tag_open_dt(DilloHtml *html, const char *tag, int tagsize)
3054 {
3055    html->styleEngine->inheritBackgroundColor ();
3056    HT2TB(html)->addParbreak (9, html->wordStyle ());
3057 }
3058 
3059 /*
3060  * <DD>
3061  */
Html_tag_open_dd(DilloHtml * html,const char * tag,int tagsize)3062 static void Html_tag_open_dd(DilloHtml *html, const char *tag, int tagsize)
3063 {
3064    Html_add_textblock(html, 9);
3065 }
3066 
3067 /*
3068  * <PRE>
3069  */
Html_tag_open_pre(DilloHtml * html,const char * tag,int tagsize)3070 static void Html_tag_open_pre(DilloHtml *html, const char *tag, int tagsize)
3071 {
3072    html->styleEngine->inheritBackgroundColor ();
3073    HT2TB(html)->addParbreak (9, html->wordStyle ());
3074 
3075    html->InFlags |= IN_PRE;
3076 }
3077 
3078 /*
3079  * Custom close for <PRE>
3080  */
Html_tag_close_pre(DilloHtml * html)3081 static void Html_tag_close_pre(DilloHtml *html)
3082 {
3083    html->InFlags &= ~IN_PRE;
3084 }
3085 
3086 /*
3087  * Check whether a tag is in the "excluding" element set for PRE
3088  * Excl. Set = {IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT, BASEFONT}
3089  */
Html_tag_pre_excludes(DilloHtml * html,int tag_idx)3090 static int Html_tag_pre_excludes(DilloHtml *html, int tag_idx)
3091 {
3092    if (!(html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f)) {
3093       /* HTML5 doesn't say anything about excluding elements */
3094       const char *es_set[] = {"img", "object", "applet", "big", "small", "sub",
3095                               "sup", "font", "basefont", NULL};
3096       static int ei_set[10], i;
3097 
3098       /* initialize array */
3099       if (!ei_set[0])
3100          for (i = 0; es_set[i]; ++i)
3101             ei_set[i] = a_Html_tag_index(es_set[i]);
3102 
3103       for (i = 0; ei_set[i]; ++i)
3104          if (tag_idx == ei_set[i])
3105             return 1;
3106    }
3107    return 0;
3108 }
3109 
3110 /*
3111  * Update the document's content type information based on meta tag data.
3112  */
Html_update_content_type(DilloHtml * html,const char * content)3113 static void Html_update_content_type(DilloHtml *html, const char *content)
3114 {
3115    const char *new_content = a_Capi_set_content_type(html->page_url, content,
3116                                                      "meta");
3117    /* Cannot ask cache whether the content type was changed, as
3118     * this code in another bw might have already changed it for us.
3119     */
3120    if (a_Misc_content_type_cmp(html->content_type, new_content)) {
3121       html->stop_parser = true; /* The cache buffer is no longer valid */
3122       a_UIcmd_repush(html->bw);
3123    }
3124 }
3125 
3126 /*
3127  * Handle <META>
3128  * We do not support http-equiv=refresh with delay>0 because it's
3129  * non standard, (the HTML 4.01 SPEC recommends explicitly to avoid it).
3130  * More info at:
3131  *   http://lists.w3.org/Archives/Public/www-html/2000Feb/thread.html#msg232
3132  * Instant client-side redirects (delay=0) are supported:
3133  *   http://www.w3.org/TR/2008/NOTE-WCAG20-TECHS-20081211/H76.html
3134  *
3135  * TODO: Note that we're sending custom HTML while still IN_HEAD. This
3136  * is a hackish way to put the message. A much cleaner approach is to
3137  * build a custom widget for it.
3138  */
Html_tag_open_meta(DilloHtml * html,const char * tag,int tagsize)3139 static void Html_tag_open_meta(DilloHtml *html, const char *tag, int tagsize)
3140 {
3141    const char meta_template[] =
3142 "<table width='100%%'><tr><td bgcolor='#ee0000'>Warning:</td>\n"
3143 " <td bgcolor='#8899aa' width='100%%'>\n"
3144 " This page uses the NON-STANDARD meta refresh tag.<br> The HTML 4.01 SPEC\n"
3145 " (sec 7.4.4) recommends explicitly to avoid it.</td></tr>\n"
3146 " <tr><td bgcolor='#a0a0a0' colspan='2'>The author wanted you to go\n"
3147 " <a href='%s'>here</a>%s</td></tr></table><br>\n";
3148 
3149    const char *p, *equiv, *charset, *content;
3150    char delay_str[64], *mr_url;
3151    DilloUrl *new_url;
3152    int delay;
3153 
3154    /* only valid inside HEAD */
3155    if (!(html->InFlags & IN_HEAD)) {
3156       if (!((html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f) &&
3157             a_Html_get_attr(html, tag, tagsize, "itemprop"))) {
3158          /* With the HTML 5.1 draft spec, meta with itemprop may appear
3159           * in the body.
3160           */
3161          BUG_MSG("This <meta> element must be inside the HEAD section.");
3162       }
3163       return;
3164    }
3165 
3166    if ((equiv = a_Html_get_attr(html, tag, tagsize, "http-equiv"))) {
3167       if (!dStrAsciiCasecmp(equiv, "refresh") &&
3168           (content = a_Html_get_attr(html, tag, tagsize, "content"))) {
3169 
3170          /* Get delay, if present, and make a message with it */
3171          if ((delay = strtol(content, NULL, 0))) {
3172             snprintf(delay_str, 64, " after %d second%s.",
3173                      delay, (delay > 1) ? "s" : "");
3174          } else {
3175             sprintf(delay_str, ".");
3176          }
3177          /* Skip to anything after "URL=" or ";" if "URL=" is not found */
3178          if ((p = dStriAsciiStr(content, "url=")))
3179             content = p + strlen("url=");
3180          else if ((p = strstr(content, ";")))
3181             content = p + strlen(";");
3182          /* Handle the case of a quoted URL */
3183          if (*content == '"' || *content == '\'') {
3184             if ((p = strchr(content + 1, *content)))
3185                mr_url = dStrndup(content + 1, p - content - 1);
3186             else
3187                mr_url = dStrdup(content + 1);
3188          } else {
3189             mr_url = dStrdup(content);
3190          }
3191          new_url = a_Html_url_new(html, mr_url, NULL, 0);
3192 
3193          if (a_Url_cmp(html->base_url, new_url) == 0) {
3194             /* redirection loop, or empty url string: ignore */
3195             BUG_MSG("<meta> refresh: %s.",
3196                     *mr_url ? "redirection loop" : "no target URL");
3197          } else if (delay == 0) {
3198             /* zero-delay redirection */
3199             html->stop_parser = true;
3200             if (URL_FLAGS(html->base_url) & URL_SpamSafe) {
3201                a_UIcmd_set_msg(html->bw,
3202                   "WARNING: local URL with META refresh.  Aborting.");
3203             } else if (a_Capi_dpi_verify_request(html->bw, new_url)) {
3204                a_UIcmd_redirection0((void*)html->bw, new_url);
3205             }
3206          } else {
3207             /* Send a custom HTML message.
3208              * TODO: This is a hairy hack,
3209              *       It'd be much better to build a widget. */
3210             Dstr *ds_msg = dStr_sized_new(256);
3211             dStr_sprintf(ds_msg, meta_template, URL_STR(new_url), delay_str);
3212             {
3213                int o_InFlags = html->InFlags;
3214                int o_TagSoup = html->TagSoup;
3215                html->InFlags = IN_BODY + IN_META_HACK;
3216                html->TagSoup = false;
3217                Html_write_raw(html, ds_msg->str, ds_msg->len, 0);
3218                html->TagSoup = o_TagSoup;
3219                html->InFlags = o_InFlags;
3220             }
3221             dStr_free(ds_msg, 1);
3222          }
3223          a_Url_free(new_url);
3224          dFree(mr_url);
3225 
3226       } else if (!dStrAsciiCasecmp(equiv, "content-type") &&
3227                  (content = a_Html_get_attr(html, tag, tagsize, "content"))) {
3228          _MSG("Html_tag_open_meta: content={%s}\n", content);
3229          Html_update_content_type(html, content);
3230       }
3231    } else if (html->DocType == DT_HTML && html->DocTypeVersion == 5.0f &&
3232               (charset = a_Html_get_attr(html, tag, tagsize, "charset"))) {
3233       char *content = dStrconcat("text/html; charset=", charset, NULL);
3234 
3235       Html_update_content_type(html, content);
3236       dFree(content);
3237    }
3238 }
3239 
3240 /*
3241  * Called by the network engine when a stylesheet has new data.
3242  */
Html_css_load_callback(int Op,CacheClient_t * Client)3243 static void Html_css_load_callback(int Op, CacheClient_t *Client)
3244 {
3245    _MSG("Html_css_load_callback: Op=%d\n", Op);
3246    if (Op) { /* EOF */
3247       BrowserWindow *bw = ((DilloWeb *)Client->Web)->bw;
3248       /* Repush when we've got them all */
3249       if (--bw->NumPendingStyleSheets == 0)
3250          a_UIcmd_repush(bw);
3251    }
3252 }
3253 
3254 /*
3255  * Tell cache to retrieve a stylesheet
3256  */
a_Html_load_stylesheet(DilloHtml * html,DilloUrl * url)3257 void a_Html_load_stylesheet(DilloHtml *html, DilloUrl *url)
3258 {
3259    char *data;
3260    int len;
3261 
3262    dReturn_if (url == NULL || ! prefs.load_stylesheets);
3263 
3264    _MSG("Html_load_stylesheet: ");
3265    if (a_Capi_get_buf(url, &data, &len)) {
3266       _MSG("cached URL=%s len=%d", URL_STR(url), len);
3267       if (a_Capi_get_flags_with_redirection(url) & CAPI_Completed) {
3268          if (strncmp("@charset \"", data, 10) == 0) {
3269             char *endq = strchr(data+10, '"');
3270 
3271             if (endq && (endq - data <= 51)) {
3272                /* IANA limits charset names to 40 characters */
3273                char *content_type;
3274 
3275                *endq = '\0';
3276                content_type = dStrconcat("text/css; charset=", data+10, NULL);
3277                *endq = '"';
3278                a_Capi_unref_buf(url);
3279                a_Capi_set_content_type(url, content_type, "meta");
3280                dFree(content_type);
3281                a_Capi_get_buf(url, &data, &len);
3282             }
3283          }
3284          html->styleEngine->parse(html, url, data, len, CSS_ORIGIN_AUTHOR);
3285       }
3286       a_Capi_unref_buf(url);
3287    } else {
3288       /* Fill a Web structure for the cache query */
3289       int ClientKey;
3290       DilloWeb *Web = a_Web_new(html->bw, url, html->page_url);
3291       Web->flags |= WEB_Stylesheet;
3292       if ((ClientKey = a_Capi_open_url(Web, Html_css_load_callback, NULL))) {
3293          ++html->bw->NumPendingStyleSheets;
3294          a_Bw_add_client(html->bw, ClientKey, 0);
3295          a_Bw_add_url(html->bw, url);
3296          MSG("NumPendingStyleSheets=%d\n", html->bw->NumPendingStyleSheets);
3297       }
3298    }
3299    _MSG("\n");
3300 }
3301 
3302 /*
3303  * Parse the LINK element (Only CSS stylesheets by now).
3304  * (If it either hits or misses, is not relevant here; that's up to the
3305  *  cache functions)
3306  *
3307  * TODO: How will we know when to use "handheld"? Ask the html->bw->ui for
3308  * screen dimensions, or a dillorc preference.
3309  */
Html_tag_open_link(DilloHtml * html,const char * tag,int tagsize)3310 static void Html_tag_open_link(DilloHtml *html, const char *tag, int tagsize)
3311 {
3312    DilloUrl *url;
3313    const char *attrbuf;
3314 
3315    //char *tag_str = dStrndup(tag, tagsize);
3316    //MSG("Html_tag_open_link(): %s\n", tag_str);
3317    //dFree(tag_str);
3318 
3319    /* When viewing suspicious HTML email, don't load LINK */
3320    dReturn_if (URL_FLAGS(html->base_url) & URL_SpamSafe);
3321 
3322    /* Ignore LINK outside HEAD */
3323    if (!(html->InFlags & IN_HEAD)) {
3324       if (!((html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f) &&
3325             a_Html_get_attr(html, tag, tagsize, "itemprop"))) {
3326          /* With the HTML 5.1 draft spec, link with itemprop may appear
3327           * in the body.
3328           */
3329          BUG_MSG("This <link> element must be inside the HEAD section.");
3330       }
3331       return;
3332    }
3333    /* Remote stylesheets enabled? */
3334    dReturn_if_fail (prefs.load_stylesheets);
3335    /* CSS stylesheet link */
3336    if (!(attrbuf = a_Html_get_attr(html, tag, tagsize, "rel")) ||
3337        dStrAsciiCasecmp(attrbuf, "stylesheet"))
3338       return;
3339 
3340    /* IMPLIED attributes? */
3341    if (((attrbuf = a_Html_get_attr(html, tag, tagsize, "type")) &&
3342         dStrAsciiCasecmp(attrbuf, "text/css")) ||
3343        ((attrbuf = a_Html_get_attr(html, tag, tagsize, "media")) &&
3344         !dStriAsciiStr(attrbuf, "screen") && dStrAsciiCasecmp(attrbuf, "all")))
3345       return;
3346 
3347    if (!(attrbuf = a_Html_get_attr(html, tag, tagsize, "href")) ||
3348        !(url = a_Html_url_new(html, attrbuf, NULL, 0)))
3349       return;
3350 
3351    _MSG("  Html_tag_open_link(): addCssUrl %s\n", URL_STR(url));
3352 
3353    html->addCssUrl(url);
3354    a_Url_free(url);
3355 }
3356 
3357 /*
3358  * Set the Document Base URI
3359  */
Html_tag_open_base(DilloHtml * html,const char * tag,int tagsize)3360 static void Html_tag_open_base(DilloHtml *html, const char *tag, int tagsize)
3361 {
3362    const char *attrbuf;
3363    DilloUrl *BaseUrl;
3364 
3365    if (html->InFlags & IN_HEAD) {
3366       if ((attrbuf = a_Html_get_attr(html, tag, tagsize, "href"))) {
3367          BaseUrl = a_Html_url_new(html, attrbuf, "", 1);
3368          if (URL_SCHEME_(BaseUrl)) {
3369             /* Pass the URL_SpamSafe flag to the new base url */
3370             a_Url_set_flags(
3371                BaseUrl, URL_FLAGS(html->base_url) & URL_SpamSafe);
3372             a_Url_free(html->base_url);
3373             html->base_url = BaseUrl;
3374          } else {
3375             BUG_MSG("<base> URI is relative (it MUST be absolute).");
3376             a_Url_free(BaseUrl);
3377          }
3378       }
3379    } else {
3380       BUG_MSG("<base> not inside HEAD section.");
3381    }
3382 }
3383 
/*
 * Default open function, shared by the elements in Tags[] that need no
 * attribute handling of their own: it only asks the style engine to
 * inherit the background color. 'tag' and 'tagsize' are unused; they are
 * present so the signature matches the other open functions.
 */
static void Html_tag_open_default(DilloHtml *html,const char *tag,int tagsize)
{
   html->styleEngine->inheritBackgroundColor();
}
3388 
3389 /*
3390  * <SPAN>
3391  */
Html_tag_open_span(DilloHtml * html,const char * tag,int tagsize)3392 static void Html_tag_open_span(DilloHtml *html, const char *tag, int tagsize)
3393 {
3394    const char *attrbuf;
3395 
3396    html->styleEngine->inheritBackgroundColor();
3397 
3398    if (prefs.show_tooltip &&
3399        (attrbuf = a_Html_get_attr(html, tag, tagsize, "title"))) {
3400 
3401       html->styleEngine->setNonCssHint (PROPERTY_X_TOOLTIP, CSS_TYPE_STRING,
3402                                         attrbuf);
3403    }
3404 }
3405 
3406 /*
3407  * html5 sectioning stuff: article aside nav section header footer
3408  */
Html_tag_open_sectioning(DilloHtml * html,const char * tag,int tagsize)3409 static void Html_tag_open_sectioning(DilloHtml *html, const char *tag,
3410                                      int tagsize)
3411 {
3412    const char *attrbuf;
3413 
3414    if (prefs.show_tooltip &&
3415        (attrbuf = a_Html_get_attr(html, tag, tagsize, "title"))) {
3416 
3417       html->styleEngine->setNonCssHint (PROPERTY_X_TOOLTIP, CSS_TYPE_STRING,
3418                                         attrbuf);
3419    }
3420 }
3421 
/*
 * <DIV> (TODO: make a complete implementation)
 * Applies the "align" attribute handling first, then behaves like the
 * html5 sectioning elements (tooltip from "title").
 */
static void Html_tag_open_div(DilloHtml *html, const char *tag, int tagsize)
{
   a_Html_tag_set_align_attr (html, tag, tagsize);
   Html_tag_open_sectioning(html, tag, tagsize);
}
3430 
3431 /*
3432  * Default close for paragraph tags - pop the stack and break.
3433  */
Html_tag_close_par(DilloHtml * html)3434 static void Html_tag_close_par(DilloHtml *html)
3435 {
3436    HT2TB(html)->addParbreak (9, html->wordStyle ());
3437 }
3438 
3439 /*
3440  * <WBR> "The wbr element represents a line break opportunity."
3441  */
Html_tag_content_wbr(DilloHtml * html,const char * tag,int tagsize)3442 static void Html_tag_content_wbr(DilloHtml *html, const char *tag, int tagsize)
3443 {
3444    HT2TB(html)->addBreakOption(html->wordStyle (), true);
3445 }
3446 
3447 
/*
 * Function index for the open, content, and close functions for each tag.
 *
 * The entries MUST stay alphabetically sorted by name: a_Html_tag_index()
 * does a binary search over this table using Html_tag_compare().
 *
 * The open and close functions are always called. They are used for style
 * handling and HTML bug reporting.
 * Content creation (e.g. adding new widgets or text) is done in the content
 * function, which is not called in the display:none case.
 * Note: many tags don't need a content function (e.g. <div>, <span>, ...).
 *
 * Entry layout:
 *   {name, Flags, EndTag, <fourth field>, open, content, close}
 *
 * Explanation for the 'Flags' field:
 *
 *   {"address", B8(010110), ...}
 *                  |||||`- inline element    (mask  1)
 *                  ||||`-- block element     (mask  2)
 *                  |||`--- inline container  (mask  4)
 *                  ||`---- block container   (mask  8)
 *                  |`----- body element      (mask 16)
 *                  `------ head element      (mask 32)
 *
 *   Notes:
 *     - The upper two bits are not used yet.
 *     - Empty elements have both inline and block container clear.
 *       (flow elements have both set)
 *
 * Explanation for the 'EndTag' field, as tested by the parser code:
 *   'O' - the close tag is optional,
 *   'F' - the close tag is forbidden (the element is closed right after
 *         being opened),
 *   'R' - presumably "required": closing such an element implicitly is
 *         reported as a bug by Html_stack_cleanup_at_open().
 *
 * NOTE(review): the meaning of the fourth (numeric) field is not documented
 * here. Also, "th" carries 1 in that field while its sibling "td" carries 3;
 * confirm that this difference is intentional.
 */

const TagInfo Tags[] = {
 {"a", B8(011101),'R',2, Html_tag_open_a, NULL, Html_tag_close_a},
 {"abbr", B8(010101),'R',2, Html_tag_open_abbr, NULL, NULL},
 /* acronym 010101 -- obsolete in HTML5 */
 {"address", B8(010110),'R',2,Html_tag_open_default, NULL, Html_tag_close_par},
 {"area", B8(010001),'F',0, Html_tag_open_default, Html_tag_content_area,
                            NULL},
 {"article", B8(011110),'R',2, Html_tag_open_sectioning, NULL, NULL},
 {"aside", B8(011110),'R',2, Html_tag_open_sectioning, NULL, NULL},
 {"audio", B8(011101),'R',2, Html_tag_open_audio, NULL, Html_tag_close_media},
 {"b", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"base", B8(100001),'F',0, Html_tag_open_base, NULL, NULL},
 /* basefont 010001 -- obsolete in HTML5 */
 /* bdo 010101 */
 {"big", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"blockquote", B8(011110),'R',2, Html_tag_open_blockquote, NULL,
                                  NULL},
 {"body", B8(011110),'O',1, Html_tag_open_body, NULL, Html_tag_close_body},
 {"br", B8(010001),'F',0, Html_tag_open_default, Html_tag_content_br,
                          NULL},
 {"button", B8(011101),'R',2, Html_tag_open_button,NULL,Html_tag_close_button},
 /* caption */
 {"center", B8(011110),'R',2, Html_tag_open_default, NULL, NULL},
 {"cite", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"code", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 /* col 010010 'F' */
 /* colgroup */
 {"dd", B8(011110),'O',1, Html_tag_open_dd, NULL, NULL},
 {"del", B8(011101),'R',2, Html_tag_open_default, NULL, NULL},
 {"dfn", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"dir", B8(011010),'R',2, Html_tag_open_dir, NULL, Html_tag_close_par},
 /* TODO: complete <div> support! */
 {"div", B8(011110),'R',2, Html_tag_open_div, NULL, NULL},
 {"dl", B8(011010),'R',2, Html_tag_open_dl, NULL, Html_tag_close_par},
 {"dt", B8(010110),'O',1, Html_tag_open_dt, NULL, Html_tag_close_par},
 {"em", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"embed", B8(010001),'F',0, Html_tag_open_embed, Html_tag_content_embed,NULL},
 /* fieldset */
 {"figcaption", B8(011110),'R',2, Html_tag_open_default, NULL, NULL},
 {"figure", B8(011110),'R',2, Html_tag_open_default, NULL, NULL},
 {"font", B8(010101),'R',2, Html_tag_open_font, NULL, NULL},
 {"footer", B8(011110),'R',2, Html_tag_open_sectioning, NULL, NULL},
 {"form", B8(011110),'R',2, Html_tag_open_form, NULL, Html_tag_close_form},
 {"frame", B8(010010),'F',0, Html_tag_open_frame, Html_tag_content_frame,
                             NULL},
 {"frameset", B8(011110),'R',2, Html_tag_open_default,
                                Html_tag_content_frameset, NULL},
 {"h1", B8(010110),'R',2, Html_tag_open_h, NULL, NULL},
 {"h2", B8(010110),'R',2, Html_tag_open_h, NULL, NULL},
 {"h3", B8(010110),'R',2, Html_tag_open_h, NULL, NULL},
 {"h4", B8(010110),'R',2, Html_tag_open_h, NULL, NULL},
 {"h5", B8(010110),'R',2, Html_tag_open_h, NULL, NULL},
 {"h6", B8(010110),'R',2, Html_tag_open_h, NULL, NULL},
 {"head", B8(101101),'O',1, Html_tag_open_head, NULL, Html_tag_close_head},
 {"header", B8(011110),'R',2, Html_tag_open_sectioning, NULL, NULL},
 {"hr", B8(010010),'F',0, Html_tag_open_hr, Html_tag_content_hr,
                          NULL},
 {"html", B8(001110),'O',1, Html_tag_open_html, NULL, Html_tag_close_html},
 {"i", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"iframe", B8(011110),'R',2, Html_tag_open_frame, Html_tag_content_frame,
                              NULL},
 {"img", B8(010001),'F',0, Html_tag_open_img, Html_tag_content_img,
                           NULL},
 {"input", B8(010001),'F',0, Html_tag_open_input, NULL, NULL},
 {"ins", B8(011101),'R',2, Html_tag_open_default, NULL, NULL},
 {"isindex", B8(110001),'F',0, Html_tag_open_isindex, NULL, NULL},
 {"kbd", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 /* label 010101 */
 /* legend 01?? */
 {"li", B8(011110),'O',1, Html_tag_open_li, NULL, Html_tag_close_li},
 {"link", B8(100001),'F',0, Html_tag_open_link, NULL, NULL},
 {"map", B8(011001),'R',2, Html_tag_open_default, Html_tag_content_map,
                           Html_tag_close_map},
 {"mark", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 /* menu 1010 -- TODO: not exactly 1010, it can contain LI and inline */
 {"menu", B8(011010),'R',2, Html_tag_open_menu, NULL, Html_tag_close_par},
 {"meta", B8(110001),'F',0, Html_tag_open_meta, NULL, NULL},
 {"nav", B8(011110),'R',2, Html_tag_open_sectioning, NULL, NULL},
 /* noframes 1011 -- obsolete in HTML5 */
 /* noscript 1011 */
 {"object", B8(111101),'R',2, Html_tag_open_object, Html_tag_content_object,
                              NULL},
 {"ol", B8(011010),'R',2, Html_tag_open_ol, NULL, NULL},
 {"optgroup", B8(010101),'O',1, Html_tag_open_optgroup, NULL,
                                Html_tag_close_optgroup},
 {"option", B8(010001),'O',0, Html_tag_open_option,NULL,Html_tag_close_option},
 {"p", B8(010110),'O',1, Html_tag_open_p, NULL, NULL},
 /* param 010001 'F' */
 {"pre", B8(010110),'R',2, Html_tag_open_pre, NULL, Html_tag_close_pre},
 {"q", B8(010101),'R',2, Html_tag_open_q, NULL, Html_tag_close_q},
 {"s", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"samp", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"script", B8(111001),'R',2, Html_tag_open_script,NULL,Html_tag_close_script},
 {"section", B8(011110),'R',2, Html_tag_open_sectioning, NULL, NULL},
 {"select", B8(010101),'R',2, Html_tag_open_select,NULL,Html_tag_close_select},
 {"small", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"source", B8(010001),'F',0, Html_tag_open_source, Html_tag_content_source,
                              NULL},
 {"span", B8(010101),'R',2, Html_tag_open_span, NULL, NULL},
 {"strike", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"strong", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"style", B8(100101),'R',2, Html_tag_open_style, NULL, Html_tag_close_style},
 {"sub", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"sup", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"table", B8(011010),'R',5, Html_tag_open_table, Html_tag_content_table,
                             NULL},
 /* tbody */
 {"td", B8(011110),'O',3, Html_tag_open_td, Html_tag_content_td,
                          NULL},
 {"textarea", B8(010101),'R', 2, Html_tag_open_textarea,
                          Html_tag_content_textarea, Html_tag_close_textarea},
 /* tfoot */
 {"th", B8(011110),'O',1, Html_tag_open_th, Html_tag_content_th,
                          NULL},
 /* thead */
 {"title", B8(100101),'R',2, Html_tag_open_title, NULL, Html_tag_close_title},
 {"tr", B8(011010),'O',4, Html_tag_open_tr, Html_tag_content_tr,
                          NULL},
 {"tt", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"u", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"ul", B8(011010),'R',2, Html_tag_open_ul, NULL, NULL},
 {"var", B8(010101),'R',2, Html_tag_open_default, NULL, NULL},
 {"video", B8(011101),'R',2, Html_tag_open_video, NULL, Html_tag_close_media},
 {"wbr", B8(010101),'F',0, Html_tag_open_default, Html_tag_content_wbr, NULL}
};
/* Number of entries in Tags[] */
#define NTAGS (sizeof(Tags)/sizeof(Tags[0]))
3599 
3600 
3601 /*
3602  * Compares tag from buffer ('/' or '>' or space-ended string) [p1]
3603  * with tag from taglist (lowercase, zero ended string) [p2]
3604  * Return value: as strcmp()
3605  */
Html_tag_compare(const char * p1,const char * p2)3606 static int Html_tag_compare(const char *p1, const char *p2)
3607 {
3608    while ( *p2 ) {
3609       if (D_ASCII_TOLOWER(*p1) != *p2)
3610          return(D_ASCII_TOLOWER(*p1) - *p2);
3611       ++p1;
3612       ++p2;
3613    }
3614    return !strchr(" >/\n\r\t", *p1);
3615 }
3616 
3617 /*
3618  * Get 'tag' index
3619  * return -1 if tag is not handled yet
3620  */
a_Html_tag_index(const char * tag)3621 int a_Html_tag_index(const char *tag)
3622 {
3623    int low, high, mid, cond;
3624 
3625    /* Binary search */
3626    low = 0;
3627    high = NTAGS - 1;          /* Last tag index */
3628    while (low <= high) {
3629       mid = (low + high) / 2;
3630       if ((cond = Html_tag_compare(tag, Tags[mid].name)) < 0 )
3631          high = mid - 1;
3632       else if (cond > 0)
3633          low = mid + 1;
3634       else
3635          return mid;
3636    }
3637    return -1;
3638 }
3639 
3640 /*
3641  * For elements with optional close, check whether is time to close.
3642  * Return value: (1: Close, 0: Don't close)
3643  * --tuned for speed.
3644  */
Html_needs_optional_close(int old_idx,int cur_idx)3645 static int Html_needs_optional_close(int old_idx, int cur_idx)
3646 {
3647    static int i_P = -1, i_LI, i_TD, i_TR, i_TH, i_DD, i_DT, i_OPTION;
3648                // i_THEAD, i_TFOOT, i_COLGROUP;
3649 
3650    if (i_P == -1) {
3651     /* initialize the indexes of elements with optional close */
3652     i_P  = a_Html_tag_index("p"),
3653     i_LI = a_Html_tag_index("li"),
3654     i_TD = a_Html_tag_index("td"),
3655     i_TR = a_Html_tag_index("tr"),
3656     i_TH = a_Html_tag_index("th"),
3657     i_DD = a_Html_tag_index("dd"),
3658     i_DT = a_Html_tag_index("dt"),
3659     i_OPTION = a_Html_tag_index("option");
3660     // i_THEAD = a_Html_tag_index("thead");
3661     // i_TFOOT = a_Html_tag_index("tfoot");
3662     // i_COLGROUP = a_Html_tag_index("colgroup");
3663    }
3664 
3665    if (old_idx == i_P || old_idx == i_DT) {
3666       /* P and DT are closed by block elements */
3667       return (Tags[cur_idx].Flags & 2);
3668    } else if (old_idx == i_LI) {
3669       /* LI closes LI */
3670       return (cur_idx == i_LI);
3671    } else if (old_idx == i_TD || old_idx == i_TH) {
3672       /* TD and TH are closed by TD, TH and TR */
3673       return (cur_idx == i_TD || cur_idx == i_TH || cur_idx == i_TR);
3674    } else if (old_idx == i_TR) {
3675       /* TR closes TR */
3676       return (cur_idx == i_TR);
3677    } else if (old_idx ==  i_DD) {
3678       /* DD is closed by DD and DT */
3679       return (cur_idx == i_DD || cur_idx == i_DT);
3680    } else if (old_idx ==  i_OPTION) {
3681       return 1;  // OPTION always needs close
3682    }
3683 
3684    /* HTML, HEAD, BODY are handled by Html_test_section(), not here. */
3685    /* TODO: TBODY is pending */
3686    return 0;
3687 }
3688 
3689 
3690 /*
3691  * Conditional cleanup of the stack (at open time).
3692  * - This helps catching block elements inside inline containers (a BUG).
3693  * - It also closes elements with "optional" close tag.
3694  *
3695  * This function is called when opening a block element or <OPTION>.
3696  *
3697  * It searches the stack closing open inline containers, and closing
3698  * elements with optional close tag when necessary.
3699  *
3700  * Note: OPTION is the only non-block element with an optional close.
3701  */
Html_stack_cleanup_at_open(DilloHtml * html,int new_idx)3702 static void Html_stack_cleanup_at_open(DilloHtml *html, int new_idx)
3703 {
3704    /* We know that the element we're about to push is a block element.
3705     * (except for OPTION, which is an empty inline, so is closed anyway)
3706     * Notes:
3707     *   Its 'tag' is not yet pushed into the stack,
3708     *   'new_idx' is its index inside Tags[].
3709     */
3710 
3711    if (!html->TagSoup)
3712       return;
3713 
3714    while (html->stack->size() > 1) {
3715       int oldtag_idx = S_TOP(html)->tag_idx;
3716 
3717       if (Tags[oldtag_idx].EndTag == 'O') {    // Element with optional close
3718          if (!Html_needs_optional_close(oldtag_idx, new_idx))
3719             break;
3720       } else if (Tags[oldtag_idx].Flags & 8) { // Block container
3721          break;
3722       }
3723 
3724       /* we have an inline (or empty) container... */
3725       if (Tags[oldtag_idx].EndTag == 'R') {
3726          BUG_MSG("<%s> is not allowed to contain <%s>. -- closing <%s>.",
3727                  Tags[oldtag_idx].name, Tags[new_idx].name,
3728                  Tags[oldtag_idx].name);
3729       }
3730 
3731       /* Workaround for Apache and its bad HTML directory listings... */
3732       if ((html->InFlags & IN_PRE) &&
3733           strcmp(Tags[new_idx].name, "hr") == 0)
3734          break;
3735       /* Avoid OPTION closing SELECT */
3736       if ((html->InFlags & IN_SELECT) &&
3737           strcmp(Tags[new_idx].name,"option") == 0)
3738          break;
3739 
3740       /* This call closes the top tag only. */
3741       Html_tag_cleanup_at_close(html, oldtag_idx);
3742    }
3743 }
3744 
3745 /*
3746  * HTML, HEAD and BODY elements have optional open and close tags.
3747  * Handle this "magic" here.
3748  */
Html_test_section(DilloHtml * html,int new_idx,int IsCloseTag)3749 static void Html_test_section(DilloHtml *html, int new_idx, int IsCloseTag)
3750 {
3751    const char *tag;
3752    int tag_idx;
3753 
3754    if (!(html->InFlags & IN_HTML) && html->DocType == DT_NONE)
3755       BUG_MSG("The required DOCTYPE declaration is missing. "
3756               "Handling as HTML4.");
3757 
3758    if (!(html->InFlags & IN_HTML)) {
3759       tag = "<html>";
3760       tag_idx = a_Html_tag_index(tag + 1);
3761       if (tag_idx != new_idx || IsCloseTag) {
3762          /* implicit open */
3763          Html_force_push_tag(html, tag_idx);
3764          _MSG("Open : %*s%s\n", html->stack->size()," ",Tags[tag_idx].name);
3765          Tags[tag_idx].open (html, tag, strlen(tag));
3766       }
3767    }
3768 
3769    if (Tags[new_idx].Flags & 32) {
3770       /* head element */
3771       if (!(html->InFlags & IN_HEAD) && html->Num_HEAD == 0) {
3772          tag = "<head>";
3773          tag_idx = a_Html_tag_index(tag + 1);
3774          if (tag_idx != new_idx || IsCloseTag) {
3775             /* implicit open of the head element */
3776             Html_force_push_tag(html, tag_idx);
3777             _MSG("Open : %*s%s\n", html->stack->size()," ",Tags[tag_idx].name);
3778             Tags[tag_idx].open (html, tag, strlen(tag));
3779          }
3780       }
3781 
3782    } else if (Tags[new_idx].Flags & 16) {
3783       /* body element */
3784       if (html->InFlags & IN_HEAD) {
3785          tag = "</head>";
3786          tag_idx = a_Html_tag_index(tag + 2);
3787          Html_tag_cleanup_at_close(html, tag_idx);
3788       }
3789       tag = "<body>";
3790       tag_idx = a_Html_tag_index(tag + 1);
3791       if (tag_idx != new_idx || IsCloseTag) {
3792          /* implicit open */
3793          Html_force_push_tag(html, tag_idx);
3794          _MSG("Open : %*s%s\n", html->stack->size()," ",Tags[tag_idx].name);
3795          Tags[tag_idx].open (html, tag, strlen(tag));
3796       }
3797    }
3798 }
3799 
3800 /*
3801  * Parse attributes that can appear on any tag.
3802  */
Html_parse_common_attrs(DilloHtml * html,char * tag,int tagsize)3803 static void Html_parse_common_attrs(DilloHtml *html, char *tag, int tagsize)
3804 {
3805    const char *attrbuf;
3806    char lang[3];
3807 
3808    if (tagsize >= 8 &&        /* length of "<t id=i>" */
3809        (attrbuf = a_Html_get_attr(html, tag, tagsize, "id"))) {
3810       /* According to the SGML declaration of HTML 4, all NAME values
3811        * occuring outside entities must be converted to uppercase
3812        * (this is what "NAMECASE GENERAL YES" says). But the HTML 4
3813        * spec states in Sec. 7.5.2 that anchor ids are case-sensitive.
3814        * So we don't do it and hope for better specs in the future ...
3815        */
3816       Html_check_name_val(html, attrbuf, "id");
3817 
3818       html->styleEngine->setId(attrbuf);
3819    }
3820 
3821    if (tagsize >= 11 && (prefs.parse_embedded_css || prefs.load_stylesheets)) {
3822       /* length of "<t class=i>" or "<t style=i>" */
3823       attrbuf = a_Html_get_attr(html, tag, tagsize, "class");
3824       if (attrbuf)
3825          html->styleEngine->setClass (attrbuf);
3826 
3827       attrbuf = a_Html_get_attr(html, tag, tagsize, "style");
3828       if (attrbuf)
3829          html->styleEngine->setStyle (attrbuf);
3830    }
3831 
3832    /* handle "xml:lang" and "lang" attributes
3833     * We use only the first two chars of the value to deal with
3834     * extended language tags (see http://www.rfc-editor.org/rfc/bcp/bcp47.txt)
3835     */
3836    memset(lang, 0, sizeof(lang));
3837    if (tagsize >= 14) {
3838       /* length of "<t xml:lang=i>" */
3839       attrbuf = a_Html_get_attr(html, tag, tagsize, "xml:lang");
3840       if (attrbuf)
3841          strncpy(lang, attrbuf, 2);
3842    }
3843    if (!lang[0] && tagsize >= 10) { /* 'xml:lang' prevails over 'lang' */
3844       /* length of "<t lang=i>" */
3845       attrbuf = a_Html_get_attr(html, tag, tagsize, "lang");
3846       if (attrbuf)
3847          strncpy(lang, attrbuf, 2);
3848    }
3849    if (lang[0])
3850       html->styleEngine->setNonCssHint(PROPERTY_X_LANG, CSS_TYPE_STRING, lang);
3851 }
3852 
3853 /*
3854  * Warn when encountering elements that are obsolete in HTML5. This list
3855  * was from the "W3C Candidate Recommendation 6 August 2013".
3856  */
Html_check_html5_obsolete(DilloHtml * html,int ni)3857 static void Html_check_html5_obsolete(DilloHtml *html, int ni)
3858 {
3859    static int indexes[9] = {-1};
3860 
3861    if (indexes[0] == -1) {
3862       indexes[0] = a_Html_tag_index("dir");
3863       indexes[1] = a_Html_tag_index("frame");
3864       indexes[2] = a_Html_tag_index("frameset");
3865       indexes[3] = a_Html_tag_index("isindex");
3866       indexes[4] = a_Html_tag_index("strike");
3867       indexes[5] = a_Html_tag_index("big");
3868       indexes[6] = a_Html_tag_index("center");
3869       indexes[7] = a_Html_tag_index("font");
3870       indexes[8] = a_Html_tag_index("tt");
3871    }
3872    for (int i = 0; i < 9; i++) {
3873       if (indexes[i] == ni) {
3874          BUG_MSG("<%s> is obsolete in HTML5.", Tags[ni].name);
3875          break;
3876       }
3877    }
3878 }
3879 
/*
 * Called from Html_process_tag() when the style of the element just opened
 * has 'display: block': adds a textblock for it.
 * NOTE(review): the meaning of the second argument (0) is defined by
 * Html_add_textblock(), which is not visible here.
 */
static void Html_display_block(DilloHtml *html)
{
   //HT2TB(html)->addParbreak (5, html->styleEngine->wordStyle ());
   Html_add_textblock(html, 0);
}
3885 
Html_display_listitem(DilloHtml * html)3886 static void Html_display_listitem(DilloHtml *html)
3887 {
3888    Style *style = html->style ();
3889    Style *wordStyle = html->wordStyle ();
3890    Widget **ref_list_item;
3891    ListItem *list_item;
3892    int *list_number;
3893    char buf[16];
3894 
3895    /* Get our parent tag's variables (used as state storage) */
3896    list_number = &html->stack->getRef(html->stack->size()-2)->list_number;
3897    ref_list_item = &html->stack->getRef(html->stack->size()-2)->ref_list_item;
3898 
3899    HT2TB(html)->addParbreak (0, wordStyle);
3900 
3901    list_item = new ListItem ((ListItem*)*ref_list_item,prefs.limit_text_width);
3902    HT2TB(html)->addWidget (list_item, style);
3903    HT2TB(html)->addParbreak (0, wordStyle);
3904    *ref_list_item = list_item;
3905    S_TOP(html)->textblock = html->dw = list_item;
3906 
3907    if (style->listStyleType == LIST_STYLE_TYPE_NONE) {
3908       // none
3909    } else if (style->listStyleType >= LIST_STYLE_TYPE_DECIMAL) {
3910       // ordered
3911       numtostr((*list_number)++, buf, 16, style->listStyleType);
3912       list_item->initWithText (buf, wordStyle);
3913    } else {
3914       // unordered
3915       list_item->initWithWidget (new Bullet(), wordStyle);
3916    }
3917 }
3918 
/*
 * Process a tag, given as 'tag' and 'tagsize'. -- tagsize is [1 based]
 * ('tag' must include the enclosing angle brackets)
 * This function calls the right open or close function for the tag.
 *
 * Outline:
 *  - Tags not found in Tags[] are ignored, except for "<!doctype", which
 *    is parsed as long as we are not yet inside HTML.
 *  - For an open tag: the stack is cleaned up when needed, the tag is
 *    pushed, common attributes are parsed, the tag's open function is
 *    called and, unless the element is display:none, so is its content
 *    function (if any).
 *  - The close part runs for close tags and also for open tags that close
 *    themselves: elements with a forbidden close tag (via ReqTagClose) and
 *    XML-style <x/> tags.
 * Nothing is done once html->stop_parser is set.
 */
static void Html_process_tag(DilloHtml *html, char *tag, int tagsize)
{
   int ci, ni;           /* current and new tag indexes */
   char *start = tag + 1; /* discard the '<' */
   int IsCloseTag = (*start == '/');

   dReturn_if (html->stop_parser == true);

   /* For a close tag, skip the '/' so the lookup starts at the name */
   ni = a_Html_tag_index(start + IsCloseTag);
   if (ni == -1) {
      /* TODO: doctype parsing is a bit fuzzy, but enough for the time being */
      if (!(html->InFlags & IN_HTML)) {
         if (tagsize > 9 && !dStrnAsciiCasecmp(tag, "<!doctype", 9))
            Html_parse_doctype(html, tag, tagsize);
      }
      /* Ignore unknown tags */
      return;
   }

   /* Obsolete-element warnings are only issued for HTML5 documents */
   if (!IsCloseTag && html->DocType == DT_HTML && html->DocTypeVersion >= 5.0f)
      Html_check_html5_obsolete(html, ni);

   /* Handle HTML, HEAD and BODY. Elements with optional open and close */
   if (!(html->InFlags & IN_BODY) /* && parsing HTML */)
      Html_test_section(html, ni, IsCloseTag);

   /* Tag processing */
   ci = S_TOP(html)->tag_idx;
   switch (IsCloseTag) {
   case 0:
      /* Open function */

      /* Cleanup when opening a block element, or
       * when opening over an element with optional close */
      if (Tags[ni].Flags & 2 || (ci != -1 && Tags[ci].EndTag == 'O'))
         Html_stack_cleanup_at_open(html, ni);

      /* TODO: this is only raising a warning, take some defined action.
       * Note: apache uses IMG inside PRE (we could use its "alt"). */
      if ((html->InFlags & IN_PRE) && Html_tag_pre_excludes(html, ni))
         BUG_MSG("<pre> is not allowed to contain <%s>.", Tags[ni].name);

      /* Make sure these elements don't nest each other */
      if (html->InFlags & (IN_BUTTON | IN_SELECT | IN_TEXTAREA))
         Html_tag_cleanup_nested_inputs(html, ni);

      /* Push the tag into the stack */
      Html_push_tag(html, ni);

      html->startElement (ni);
      _MSG("Open : %*s%s\n", html->stack->size(), " ", Tags[ni].name);

      /* Parse attributes that can appear on any tag */
      Html_parse_common_attrs(html, tag, tagsize);

      /* Call the open function for this tag */
      _MSG("Html_process_tag Open : %s\n", Tags[ni].name);
      Tags[ni].open (html, tag, tagsize);

      /* Act on the 'display' style property, unless the element is already
       * marked as display:none */
      if (! S_TOP(html)->display_none) {
         switch (html->style ()->display) {
            case DISPLAY_BLOCK:
               Html_display_block(html);
               break;
            case DISPLAY_LIST_ITEM:
               Html_display_listitem(html);
               break;
            case DISPLAY_NONE:
               S_TOP(html)->display_none = true;
               break;
            case DISPLAY_INLINE:
            case DISPLAY_INLINE_BLOCK: // TODO: implement inline-block
            default:
               break;
         }

         /* The content function is skipped in the display:none case
          * (the flag may have just been set by the switch above) */
         if (Tags[ni].content && ! S_TOP(html)->display_none) {
            Tags[ni].content (html, tag, tagsize);
         }
      }

      /* The open/content functions may have stopped the parser */
      if (html->stop_parser)
         break;

      /* Switch to PRE parsing when the style asks for preserved white
       * space, unless the element is parsed verbatim or is already PRE */
      if (S_TOP(html)->parse_mode == DILLO_HTML_PARSE_MODE_VERBATIM) {
         /* don't change anything */
      } else if (S_TOP(html)->parse_mode != DILLO_HTML_PARSE_MODE_PRE &&
          (html->style ()->whiteSpace == WHITE_SPACE_PRE ||
           html->style ()->whiteSpace == WHITE_SPACE_PRE_WRAP)) {
         S_TOP(html)->parse_mode = DILLO_HTML_PARSE_MODE_PRE;
         html->pre_column = 0;
         html->PreFirstChar = true;
      }

      /* An element with an id becomes an anchor */
      if (html->styleEngine->getId ())
         Html_add_anchor(html, html->styleEngine->getId ());

      /* Request immediate close for elements with forbidden close tag. */
      /* TODO: XHTML always requires close tags. A simple implementation
       * of the commented clause below will make it work. */
      if (/* parsing HTML && */ Tags[ni].EndTag == 'F')
         html->ReqTagClose = true;

      /* Don't break! Open tags may also close themselves
       * (intentional fall-through into the close part) */

   default:
      /* Close function */

      /* Test for </x>, ReqTagClose, <x /> and <x/> */
      if (*start == '/' ||                                      /* </x>    */
          html->ReqTagClose ||                                  /* request */
          (tag[tagsize-2] == '/' &&                             /* XML:    */
           (strchr(" \"'", tag[tagsize-3]) ||                   /* [ "']/> */
            (size_t)tagsize == strlen(Tags[ni].name) + 3))) {   /*  <x/>   */

         _MSG("Html_process_tag Close: %s\n", Tags[ni].name);
         Html_tag_cleanup_at_close(html, ni);
         /* This was a close tag */
         html->ReqTagClose = false;
      }
   }
}
4046 
4047 /*
4048  * Get attribute value for 'attrname' and return it.
4049  *  Tags start with '<' and end with a '>' (Ex: "<P align=center>")
4050  *  tagsize = strlen(tag) from '<' to '>', inclusive.
4051  *
4052  * Returns one of the following:
4053  *    * The value of the attribute.
4054  *    * An empty string if the attribute exists but has no value.
4055  *    * NULL if the attribute doesn't exist.
4056  */
Html_get_attr2(DilloHtml * html,const char * tag,int tagsize,const char * attrname,int tag_parsing_flags)4057 static const char *Html_get_attr2(DilloHtml *html,
4058                                   const char *tag,
4059                                   int tagsize,
4060                                   const char *attrname,
4061                                   int tag_parsing_flags)
4062 {
4063    int i, isocode, entsize, Found = 0, delimiter = 0, attr_pos = 0;
4064    Dstr *Buf = html->attr_data;
4065    DilloHtmlTagParsingState state = SEEK_ATTR_START;
4066 
4067    dReturn_val_if_fail(*attrname, NULL);
4068 
4069    dStr_truncate(Buf, 0);
4070 
4071    for (i = 1; i < tagsize; ++i) {
4072       switch (state) {
4073       case SEEK_ATTR_START:
4074          if (isspace(tag[i]))
4075             state = SEEK_TOKEN_START;
4076          else if (tag[i] == '=')
4077             state = SEEK_VALUE_START;
4078          break;
4079 
4080       case MATCH_ATTR_NAME:
4081          if (!attrname[attr_pos] &&
4082              (tag[i] == '=' || isspace(tag[i]) || tag[i] == '>')) {
4083             Found = 1;
4084             state = SEEK_TOKEN_START;
4085             --i;
4086          } else if (!tag[i]) {
4087             state = SEEK_ATTR_START; // NULL byte is not allowed
4088          } else {
4089             if (D_ASCII_TOLOWER(tag[i]) != D_ASCII_TOLOWER(attrname[attr_pos]))
4090                state = SEEK_ATTR_START;
4091             attr_pos++;
4092          }
4093          break;
4094 
4095       case SEEK_TOKEN_START:
4096          if (tag[i] == '=') {
4097             state = SEEK_VALUE_START;
4098          } else if (!isspace(tag[i])) {
4099             attr_pos = 0;
4100             state = (Found) ? FINISHED : MATCH_ATTR_NAME;
4101             --i;
4102          }
4103          break;
4104       case SEEK_VALUE_START:
4105          if (!isspace(tag[i])) {
4106             delimiter = (tag[i] == '"' || tag[i] == '\'') ? tag[i] : ' ';
4107             i -= (delimiter == ' ');
4108             state = (Found) ? GET_VALUE : SKIP_VALUE;
4109          }
4110          break;
4111 
4112       case SKIP_VALUE:
4113          if ((delimiter == ' ' && isspace(tag[i])) || tag[i] == delimiter)
4114             state = SEEK_TOKEN_START;
4115          break;
4116       case GET_VALUE:
4117          if ((delimiter == ' ' && (isspace(tag[i]) || tag[i] == '>')) ||
4118              tag[i] == delimiter) {
4119             state = FINISHED;
4120          } else if (tag[i] == '&' &&
4121                     (tag_parsing_flags & HTML_ParseEntities)) {
4122             if ((isocode = Html_parse_entity(html, tag+i,
4123                                              tagsize-i, &entsize)) >= 0) {
4124                if (isocode >= 128) {
4125                   char buf[4];
4126                   int k, n = a_Utf8_encode(isocode, buf);
4127                   for (k = 0; k < n; ++k)
4128                      dStr_append_c(Buf, buf[k]);
4129                } else {
4130                   dStr_append_c(Buf, (char) isocode);
4131                }
4132                i += entsize-1;
4133             } else {
4134                dStr_append_c(Buf, tag[i]);
4135             }
4136          } else if (tag[i] == '\r' || tag[i] == '\t') {
4137             dStr_append_c(Buf, ' ');
4138          } else if (tag[i] == '\n') {
4139             /* ignore */
4140          } else {
4141             dStr_append_c(Buf, tag[i]);
4142          }
4143          break;
4144 
4145       case FINISHED:
4146          i = tagsize;
4147          break;
4148       }
4149    }
4150 
4151    if (tag_parsing_flags & HTML_LeftTrim)
4152       while (isspace(Buf->str[0]))
4153          dStr_erase(Buf, 0, 1);
4154    if (tag_parsing_flags & HTML_RightTrim)
4155       while (Buf->len && isspace(Buf->str[Buf->len - 1]))
4156          dStr_truncate(Buf, Buf->len - 1);
4157 
4158    return (Found) ? Buf->str : NULL;
4159 }
4160 
4161 /*
4162  * Call Html_get_attr2 telling it to parse entities and strip the result
4163  */
a_Html_get_attr(DilloHtml * html,const char * tag,int tagsize,const char * attrname)4164 const char *a_Html_get_attr(DilloHtml *html,
4165                             const char *tag,
4166                             int tagsize,
4167                             const char *attrname)
4168 {
4169    return Html_get_attr2(html, tag, tagsize, attrname,
4170                          HTML_LeftTrim | HTML_RightTrim | HTML_ParseEntities);
4171 }
4172 
4173 /*
4174  * "a_Html_get_attr with default"
4175  * Call a_Html_get_attr() and dStrdup() the returned string.
4176  * If the attribute isn't found a copy of 'def' is returned.
4177  */
a_Html_get_attr_wdef(DilloHtml * html,const char * tag,int tagsize,const char * attrname,const char * def)4178 char *a_Html_get_attr_wdef(DilloHtml *html,
4179                            const char *tag,
4180                            int tagsize,
4181                            const char *attrname,
4182                            const char *def)
4183 {
4184    const char *attrbuf = a_Html_get_attr(html, tag, tagsize, attrname);
4185 
4186    return attrbuf ? dStrdup(attrbuf) : dStrdup(def);
4187 }
4188 
4189 /*
4190  * Dispatch the apropriate function for 'Op'
4191  * This function is a Cache client and gets called whenever new data arrives
4192  *  Op      : operation to perform.
4193  *  CbData  : a pointer to a DilloHtml structure
4194  *  Buf     : a pointer to new data
4195  *  BufSize : new data size (in bytes)
4196  */
Html_callback(int Op,CacheClient_t * Client)4197 static void Html_callback(int Op, CacheClient_t *Client)
4198 {
4199    DilloHtml *html = (DilloHtml*)Client->CbData;
4200 
4201    if (Op) { /* EOF */
4202       html->write((char*)Client->Buf, Client->BufSize, 1);
4203       html->finishParsing(Client->Key);
4204    } else {
4205       html->write((char*)Client->Buf, Client->BufSize, 0);
4206    }
4207 }
4208 
4209 /*
4210  * Here's where we parse the html and put it into the Textblock structure.
4211  * Return value: number of bytes parsed
4212  */
Html_write_raw(DilloHtml * html,char * buf,int bufsize,int Eof)4213 static int Html_write_raw(DilloHtml *html, char *buf, int bufsize, int Eof)
4214 {
4215    char ch = 0, *p, *text;
4216    int token_start, buf_index;
4217 
4218    /* Now, 'buf' and 'bufsize' define a buffer aligned to start at a token
4219     * boundary. Iterate through tokens until end of buffer is reached. */
4220    buf_index = 0;
4221    token_start = buf_index;
4222    while ((buf_index < bufsize) && !html->stop_parser) {
4223       /* invariant: buf_index == bufsize || token_start == buf_index */
4224 
4225       if (S_TOP(html)->parse_mode ==
4226           DILLO_HTML_PARSE_MODE_VERBATIM) {
4227          /* Non HTML code here, let's skip until closing tag */
4228          do {
4229             const char *tag = Tags[S_TOP(html)->tag_idx].name;
4230             buf_index += strcspn(buf + buf_index, "<");
4231             if (buf_index + (int)strlen(tag) + 3 > bufsize) {
4232                buf_index = bufsize;
4233             } else if (strncmp(buf + buf_index, "</", 2) == 0 &&
4234                        Html_match_tag(tag, buf+buf_index+2, strlen(tag)+1)) {
4235                /* copy VERBATIM text into the stash buffer */
4236                text = dStrndup(buf + token_start, buf_index - token_start);
4237                dStr_append(html->Stash, text);
4238                dFree(text);
4239                token_start = buf_index;
4240                break;
4241             } else
4242                ++buf_index;
4243          } while (buf_index < bufsize);
4244 
4245          if (buf_index == bufsize)
4246             break;
4247       }
4248 
4249       if (isspace(buf[buf_index])) {
4250          /* whitespace: group all available whitespace */
4251          while (++buf_index < bufsize && isspace(buf[buf_index])) ;
4252          Html_process_space(html, buf + token_start, buf_index - token_start);
4253          token_start = buf_index;
4254 
4255       } else if (buf[buf_index] == '<' && (ch = buf[buf_index + 1]) &&
4256                  (isalpha(ch) || strchr("/!?", ch)) ) {
4257          /* Tag */
4258          if (buf_index + 3 < bufsize && !strncmp(buf + buf_index, "<!--", 4)) {
4259             /* Comment: search for close of comment, skipping over
4260              * everything except a matching "-->" tag. */
4261             while ( (p = (char*) memchr(buf + buf_index, '>',
4262                                         bufsize - buf_index)) ){
4263                buf_index = p - buf + 1;
4264                if (p[-1] == '-' && p[-2] == '-') break;
4265             }
4266             if (p) {
4267                /* Got the whole comment. Let's throw it away! :) */
4268                token_start = buf_index;
4269             } else
4270                buf_index = bufsize;
4271          } else {
4272             /* Tag: search end of tag (skipping over quoted strings) */
4273             html->CurrOfs = html->Start_Ofs + token_start;
4274 
4275             while ( buf_index < bufsize ) {
4276                buf_index++;
4277                buf_index += strcspn(buf + buf_index, ">\"'<");
4278                if ((ch = buf[buf_index]) == '>') {
4279                   break;
4280                } else if (ch == '"' || ch == '\'') {
4281                   /* Skip over quoted string */
4282                   buf_index++;
4283                   buf_index += strcspn(buf + buf_index,
4284                                        (ch == '"') ? "\">" : "'>");
4285                   if (buf[buf_index] == '>') {
4286                      /* Unterminated string value? Let's look ahead and test:
4287                       * (<: unterminated, closing-quote: terminated) */
4288                      int offset = buf_index + 1;
4289                      offset += strcspn(buf + offset,
4290                                        (ch == '"') ? "\"<" : "'<");
4291                      if (buf[offset] == ch || !buf[offset]) {
4292                         buf_index = offset;
4293                      } else {
4294                         BUG_MSG("Attribute lacks closing quote.");
4295                         break;
4296                      }
4297                   }
4298                } else if (ch == '<') {
4299                   /* unterminated tag detected */
4300                   p = dStrndup(buf+token_start+1,
4301                                strcspn(buf+token_start+1, " <\n\r\t"));
4302                   BUG_MSG("<%s> lacks its closing '>'.", p);
4303                   dFree(p);
4304                   --buf_index;
4305                   break;
4306                }
4307             }
4308             if (buf_index < bufsize) {
4309                buf_index++;
4310                Html_process_tag(html, buf + token_start,
4311                                 buf_index - token_start);
4312                token_start = buf_index;
4313             }
4314          }
4315       } else {
4316          /* A Word: search for whitespace or tag open */
4317          html->CurrOfs = html->Start_Ofs + token_start;
4318 
4319          while (++buf_index < bufsize) {
4320             buf_index += strcspn(buf + buf_index, " <\n\r\t\f\v");
4321             if (buf[buf_index] == '<' && (ch = buf[buf_index + 1]) &&
4322                 !isalpha(ch) && !strchr("/!?", ch))
4323                continue;
4324             break;
4325          }
4326          if (buf_index < bufsize || Eof) {
4327             /* successfully found end of token */
4328             ch = buf[buf_index];
4329             buf[buf_index] = 0;
4330             Html_process_word(html, buf + token_start,
4331                               buf_index - token_start);
4332             buf[buf_index] = ch;
4333             token_start = buf_index;
4334          }
4335       }
4336    }/*while*/
4337 
4338    HT2TB(html)->flush ();
4339 
4340    return token_start;
4341 }
4342 
4343 
4344