1 /*
2  * html_read.c
3  *
4  * Copyright (C) 1999, 2000 Rasca, Berlin
5  * EMail: thron@gmx.de
6  * Copyright (c) 2001 Andreas J. Guelzow
7  * EMail: aguelzow@taliesin.ca
8  * Copyright (c) 2002 Jody Goldberg
9  * EMail: jody@gnome.org
10  *
11  * Contributors :
12  *   Almer S. Tigelaar <almer1@dds.nl>
13  *   Andreas J. Guelzow <aguelzow@taliesin.ca>
14  *
15  * This program is free software; you can redistribute it and/or modify
16  * it under the terms of the GNU General Public License as published by
17  * the Free Software Foundation; either version 2 of the License, or
18  * (at your option) any later version.
19  *
20  * This program is distributed in the hope that it will be useful,
21  * but WITHOUT ANY WARRANTY; without even the implied warranty of
22  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23  * GNU General Public License for more details.
24  *
25  * You should have received a copy of the GNU General Public License
26  * along with this program; if not, see <https://www.gnu.org/licenses/>.
27  */
28 
29 #include <gnumeric-config.h>
30 #include <glib/gi18n-lib.h>
31 #include <gnumeric.h>
32 #include <string.h>
33 #include "html.h"
34 
35 #include <sheet-object-cell-comment.h>
36 #include <workbook-view.h>
37 #include <workbook.h>
38 #include <sheet.h>
39 #include <sheet-merge.h>
40 #include <sheet-style.h>
41 #include <style.h>
42 #include <style-color.h>
43 #include <hlink.h>
44 #include <cell.h>
45 #include <ranges.h>
46 #include <goffice/goffice.h>
47 
48 #include <gsf/gsf-input.h>
49 #include <libxml/HTMLparser.h>
50 #include <libxml/HTMLtree.h>
51 
/*
 * Cast helpers between libxml2 strings (xmlChar) and plain C strings.
 * The leading "C" variants keep the pointee const.
 */
#define CC2XML(s) ((xmlChar const *)(s))	/* -> xmlChar const * */
#define C2XML(s) ((xmlChar *)(s))		/* -> xmlChar * */
#define CXML2C(s) ((char const *)(s))		/* -> char const * */
#define XML2C(s) ((char *)(s))			/* -> char * */
56 
57 typedef struct {
58 	Sheet *sheet;
59 	int   row;
60 	WorkbookView *wb_view;
61 } GnmHtmlTableCtxt;
62 
63 static void html_read_table (htmlNodePtr cur, htmlDocPtr doc,
64 			     WorkbookView *wb_view,
65 			     GnmHtmlTableCtxt *tc);
66 
67 
68 static Sheet *
html_get_sheet(char const * name,Workbook * wb)69 html_get_sheet (char const *name, Workbook *wb)
70 {
71 	Sheet *sheet = NULL;
72 
73 	if (name) {
74 		sheet = workbook_sheet_by_name (wb, name);
75 		if (sheet == NULL) {
76 			sheet = sheet_new (wb, name, GNM_DEFAULT_COLS, GNM_DEFAULT_ROWS);
77 			workbook_sheet_attach (wb, sheet);
78 		}
79 	} else
80 		sheet = workbook_sheet_add (wb, -1, GNM_DEFAULT_COLS, GNM_DEFAULT_ROWS);
81 	return sheet;
82 }
83 
84 static void
html_append_text(GString * buf,const xmlChar * text)85 html_append_text (GString *buf, const xmlChar *text)
86 {
87 	const xmlChar *p;
88 
89 	while (*text) {
90 		while (g_unichar_isspace (g_utf8_get_char (text)))
91 			text = g_utf8_next_char (text);
92 		if (*text) {
93 			for (p = text;
94 			     *p && !g_unichar_isspace (g_utf8_get_char (p));
95 			     p =  g_utf8_next_char (p))
96 				;
97 			if (buf->len > 0)
98 				g_string_append_c (buf, ' ');
99 			g_string_append_len (buf, text, p - text);
100 			text = p;
101 		}
102 	}
103 }
104 
105 static void
html_read_content(htmlNodePtr cur,GString * buf,GnmStyle * mstyle,xmlBufferPtr a_buf,GSList ** hrefs,gboolean first,htmlDocPtr doc,GnmHtmlTableCtxt * tc)106 html_read_content (htmlNodePtr cur, GString *buf, GnmStyle *mstyle,
107 		   xmlBufferPtr a_buf, GSList **hrefs, gboolean first,
108 		   htmlDocPtr doc, GnmHtmlTableCtxt *tc)
109 {
110 	htmlNodePtr ptr;
111 
112 	for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
113 		if (ptr->type == XML_TEXT_NODE) {
114 			if (g_utf8_validate (ptr->content, -1, NULL))
115 				html_append_text (buf, ptr->content);
116 			else
117 				g_string_append (buf, _("[Warning: Invalid text string has been removed.]"));
118 		} else if (ptr->type == XML_ELEMENT_NODE) {
119 			if (first) {
120 				if (xmlStrEqual (ptr->name, CC2XML ("i"))
121 				    || xmlStrEqual (ptr->name, CC2XML ("em")))
122 					gnm_style_set_font_italic (mstyle, TRUE);
123 				if (xmlStrEqual (ptr->name, CC2XML ("b")))
124 					gnm_style_set_font_bold (mstyle, TRUE);
125 			}
126 			if (xmlStrEqual (ptr->name, CC2XML ("a"))) {
127 				xmlAttrPtr   props;
128 				props = ptr->properties;
129 				while (props) {
130 					if (xmlStrEqual (props->name, CC2XML ("href")) && props->children) {
131 						*hrefs = g_slist_prepend (
132 							*hrefs, props->children);
133 
134 					}
135 					props = props->next;
136 				}
137 			}
138 			if (xmlStrEqual (ptr->name, CC2XML ("img"))) {
139 				xmlAttrPtr   props;
140 				props = ptr->properties;
141 				while (props) {
142 					if (xmlStrEqual (props->name, CC2XML ("src")) && props->children) {
143 						htmlNodeDump (a_buf, doc, props->children);
144 						xmlBufferAdd (a_buf, CC2XML ("\n"), -1);
145 					}
146 					props = props->next;
147 				}
148 			}
149 			if (xmlStrEqual (ptr->name, CC2XML ("table"))) {
150 				Sheet *last_sheet = tc->sheet;
151 				int   last_row = tc->row;
152 				tc->sheet = NULL;
153 				tc->row   = -1;
154 				html_read_table (ptr, doc, tc->wb_view, tc);
155 				if (tc->sheet) {
156 					g_string_append_printf (buf, _("[see sheet %s]"), tc->sheet->name_quoted);
157 					xmlBufferAdd (a_buf, CC2XML (_("The original html file is\n"
158 								       "using nested tables.")), -1);
159 				}
160 				tc->sheet = last_sheet;
161 				tc->row = last_row;
162 			} else
163 				html_read_content
164 					(ptr, buf, mstyle, a_buf, hrefs, first, doc, tc);
165 		}
166 		first = FALSE;
167 	}
168 }
169 
170 static void
html_read_row(htmlNodePtr cur,htmlDocPtr doc,GnmHtmlTableCtxt * tc)171 html_read_row (htmlNodePtr cur, htmlDocPtr doc, GnmHtmlTableCtxt *tc)
172 {
173 	htmlNodePtr ptr;
174 	int col = -1;
175 
176 	for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
177 		if (xmlStrEqual (ptr->name, CC2XML ("td")) ||
178 		    xmlStrEqual (ptr->name, CC2XML ("th"))) {
179 			GString *buf;
180 			xmlBufferPtr a_buf;
181 			xmlAttrPtr   props;
182 			int colspan = 1;
183 			int rowspan = 1;
184 			GnmCellPos pos;
185 			GnmStyle *mstyle;
186 			GSList *hrefs = NULL;
187 			GnmHLink *lnk = NULL;
188 
189 			/* Check whether we need to skip merges from above */
190 			pos.row = tc->row;
191 			pos.col = col + 1;
192 			while (gnm_sheet_merge_contains_pos (tc->sheet, &pos)) {
193 				col++;
194 				pos.col++;
195 			}
196 
197 			/* Do we span across multiple rows or cols? */
198 			props = ptr->properties;
199 			while (props) {
200 				if (xmlStrEqual (props->name, CC2XML ("colspan")) && props->children)
201 				    colspan = atoi (CXML2C (props->children->content));
202 				if (xmlStrEqual (props->name, CC2XML ("rowspan")) && props->children)
203 				    rowspan = atoi (CXML2C (props->children->content));
204 				props = props->next;
205 			}
206 			if (colspan < 1)
207 				colspan = 1;
208 			if (rowspan < 1)
209 				rowspan = 1;
210 
211 			/* Let's figure out the content of the cell */
212 			buf = g_string_new (NULL);
213 			a_buf = xmlBufferCreate ();
214 
215 			mstyle = gnm_style_new_default ();
216 			if (xmlStrEqual (ptr->name, CC2XML ("th")))
217 				gnm_style_set_font_bold (mstyle, TRUE);
218 
219 			html_read_content (ptr, buf, mstyle, a_buf,
220 					   &hrefs, TRUE, doc, tc);
221 
222 
223 			if (g_slist_length (hrefs) >= 1 &&
224 			    buf->len > 0) {
225 				/* One hyperlink, and text to make it
226 				 * visible */
227 				char *url;
228 				xmlBufferPtr h_buf = xmlBufferCreate ();
229 
230 				hrefs = g_slist_reverse (hrefs);
231 				htmlNodeDump (
232 					h_buf, doc, (htmlNodePtr)hrefs->data);
233 				url = g_strndup (
234 					CXML2C (h_buf->content), h_buf->use);
235 				if (strncmp (url, "mailto:",
236 					     strlen ("mailto:")) == 0)
237 					lnk = gnm_hlink_new (
238 						gnm_hlink_email_get_type (),
239 						tc->sheet);
240 				else
241 					lnk = gnm_hlink_new (
242 						gnm_hlink_url_get_type (),
243 						tc->sheet);
244 				gnm_hlink_set_target (lnk, url);
245 				gnm_style_set_hlink (mstyle, lnk);
246 				gnm_style_set_font_uline (mstyle,
247 							  UNDERLINE_SINGLE);
248 				gnm_style_set_font_color (mstyle,
249 							  gnm_color_new_go (GO_COLOR_BLUE));
250 				g_free (url);
251 				xmlBufferFree (h_buf);
252 			}
253 			if (g_slist_length (hrefs) > 1 || buf->len <= 0) {
254 				/* Multiple links,
255 				 * or no text to give hyperlink style,
256 				 * so put them in a comment */
257 				GSList *l;
258 
259 				for (l = hrefs; l != NULL; l = l->next) {
260 					htmlNodeDump (a_buf, doc,
261 						      (htmlNodePtr)l->data);
262 					xmlBufferAdd (a_buf, CC2XML ("\n"),
263 						      -1);
264 				}
265 			}
266 			g_slist_free (hrefs);
267 			if (buf->len > 0) {
268 				GnmCell *cell = sheet_cell_fetch (tc->sheet, col + 1, tc->row);
269 				sheet_style_set_pos (tc->sheet, col + 1, tc->row, mstyle);
270 				gnm_cell_set_text (cell, buf->str);
271 			} else
272 				gnm_style_unref (mstyle);
273 
274 			if (a_buf->use > 0) {
275 				char *name;
276 
277 				name = g_strndup (CXML2C (a_buf->content), a_buf->use);
278 				cell_set_comment (tc->sheet, &pos, NULL, name, NULL);
279 				g_free (name);
280 			}
281 			g_string_free (buf, TRUE);
282 			xmlBufferFree (a_buf);
283 
284 			/* If necessary create the merge */
285 			if (colspan > 1 || rowspan > 1) {
286 				GnmRange range;
287 				GnmRange *r = &range;
288 
289 				range_init (r, col + 1, tc->row, col + colspan, tc->row + rowspan - 1);
290 				gnm_sheet_merge_add (tc->sheet, r, FALSE, NULL);
291 			}
292 
293 			col += colspan;
294 		}
295 	}
296 }
297 
298 static void
html_read_rows(htmlNodePtr cur,htmlDocPtr doc,Workbook * wb,GnmHtmlTableCtxt * tc)299 html_read_rows (htmlNodePtr cur, htmlDocPtr doc, Workbook *wb,
300 		GnmHtmlTableCtxt *tc)
301 {
302 	htmlNodePtr ptr;
303 
304 	for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
305 		if (ptr->type != XML_ELEMENT_NODE)
306 			continue;
307 		if (xmlStrEqual (ptr->name, CC2XML ("tr"))) {
308 			tc->row++;
309 			if (tc->sheet == NULL)
310 				tc->sheet = html_get_sheet (NULL, wb);
311 			html_read_row (ptr, doc, tc);
312 		}
313 	}
314 }
315 
316 static void
html_read_table(htmlNodePtr cur,htmlDocPtr doc,WorkbookView * wb_view,GnmHtmlTableCtxt * tc)317 html_read_table (htmlNodePtr cur, htmlDocPtr doc, WorkbookView *wb_view,
318 		 GnmHtmlTableCtxt *tc)
319 {
320 	Workbook *wb;
321 	htmlNodePtr ptr, ptr2;
322 
323 	g_return_if_fail (cur != NULL);
324 	g_return_if_fail (wb_view != NULL);
325 
326 	wb = wb_view_get_workbook (wb_view);
327 	for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
328 		if (ptr->type != XML_ELEMENT_NODE)
329 			continue;
330 		if (xmlStrEqual (ptr->name, CC2XML ("caption"))) {
331 			xmlBufferPtr buf;
332 			buf = xmlBufferCreate ();
333 			for (ptr2 = ptr->children; ptr2 != NULL ; ptr2 = ptr2->next) {
334 				htmlNodeDump (buf, doc, ptr2);
335 			}
336 			if (buf->use > 0) {
337 				char *name;
338 				name = g_strndup (CXML2C (buf->content), buf->use);
339 				tc->sheet = html_get_sheet (name, wb);
340 				g_free (name);
341 			}
342 			xmlBufferFree (buf);
343 		} else if (xmlStrEqual (ptr->name, CC2XML ("thead")) ||
344 			   xmlStrEqual (ptr->name, CC2XML ("tfoot")) ||
345 			   xmlStrEqual (ptr->name, CC2XML ("tbody"))) {
346 			html_read_rows (ptr, doc, wb, tc);
347 		} else if (xmlStrEqual (ptr->name, CC2XML ("tr"))) {
348 			html_read_rows (cur, doc, wb, tc);
349 			break;
350 		}
351 	}
352 }
353 
/* Elements whose presence means we are inside a table. */
static char const *table_start_elt_types[] = {
	"caption", "col", "colgroup",
	"tbody", "tfoot", "thead",
	"tr",
	NULL
};

/* Elements whose presence means we are inside a table row. */
static char const *row_start_elt_types[] = { "td", "th", NULL };

/* Elements that may appear inside tables and rows as well as outside. */
static char const *cont_elt_types[] = { "del", "ins", NULL };
379 
380 static gboolean
is_elt_type(htmlNodePtr ptr,char const ** types)381 is_elt_type (htmlNodePtr ptr, char const ** types)
382 {
383 	char const **p;
384 	gboolean ret = FALSE;
385 
386 	for (p = types; *p; p++)
387 		if (xmlStrEqual (ptr->name, CC2XML ((*p)))) {
388 			ret = TRUE;
389 			break;
390 		}
391 
392 	return ret;
393 }
394 
395 static gboolean
starts_inferred_table(htmlNodePtr ptr)396 starts_inferred_table (htmlNodePtr ptr)
397 {
398 	return ((ptr->type == XML_ELEMENT_NODE) &&
399 		is_elt_type (ptr, table_start_elt_types));
400 }
401 
402 static gboolean
ends_inferred_table(htmlNodePtr ptr)403 ends_inferred_table (htmlNodePtr ptr)
404 {
405 	return ((ptr->type == XML_ELEMENT_NODE) &&
406 		!(is_elt_type (ptr, table_start_elt_types) ||
407 		  is_elt_type (ptr, cont_elt_types)));
408 }
409 
410 static gboolean
starts_inferred_row(htmlNodePtr ptr)411 starts_inferred_row (htmlNodePtr ptr)
412 {
413 	return ((ptr->type == XML_ELEMENT_NODE) &&
414 		is_elt_type (ptr, row_start_elt_types));
415 }
416 
417 static gboolean
ends_inferred_row(htmlNodePtr ptr)418 ends_inferred_row (htmlNodePtr ptr)
419 {
420 	return ((ptr->type == XML_ELEMENT_NODE) &&
421 		!(is_elt_type (ptr, row_start_elt_types) ||
422 		  is_elt_type (ptr, cont_elt_types)));
423 }
424 
425 /*
426  * Handles incomplete html fragments as may occur on the clipboard,
427  * e.g. a <td> without <tr> and <table> in front of it.
428  */
429 static void
html_search_for_tables(htmlNodePtr cur,htmlDocPtr doc,WorkbookView * wb_view,GnmHtmlTableCtxt * tc)430 html_search_for_tables (htmlNodePtr cur, htmlDocPtr doc,
431 			WorkbookView *wb_view, GnmHtmlTableCtxt *tc)
432 {
433 	htmlNodePtr ptr;
434 
435 	if (cur == NULL) {
436 		xmlGenericError(xmlGenericErrorContext,
437 				"htmlNodeDumpFormatOutput : node == NULL\n");
438 		return;
439 	}
440 
441 	if (cur->type != XML_ELEMENT_NODE)
442 		return;
443 
444 	if (xmlStrEqual (cur->name, CC2XML ("table"))) {
445 		html_read_table (cur, doc, wb_view, tc);
446 	} else if (starts_inferred_table (cur) || starts_inferred_row (cur)) {
447 		htmlNodePtr tnode = xmlNewNode (NULL, "table");
448 
449 		/* Link in a table node */
450 		xmlAddPrevSibling (cur, tnode);
451 		if (starts_inferred_row (cur)) {
452 			htmlNodePtr rnode = xmlNewNode (NULL, "tr");
453 
454 			/* Link in a row node */
455 			xmlAddChild (tnode, rnode);
456 			/* Make following elements children of the row node,
457 			 * until we meet one which isn't legal in a row. */
458 			while ((ptr = tnode->next) != NULL) {
459 				if (ends_inferred_row (ptr))
460 					break;
461 				xmlUnlinkNode (ptr);
462 				xmlAddChild (rnode, ptr);
463 			}
464 		}
465 		/* Make following elements children of the row node,
466 		 * until we meet one which isn't legal in a table. */
467 		while ((ptr = tnode->next) != NULL) {
468 			if (ends_inferred_table (ptr))
469 				break;
470 			xmlUnlinkNode (ptr);
471 			xmlAddChild (tnode, ptr);
472 		}
473 		html_read_table (tnode, doc, wb_view, tc);
474 	} else {
475 		for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
476 			html_search_for_tables (ptr, doc, wb_view, tc);
477 			/* ptr may now have been pushed down in the tree,
478 			 * if so, ptr->next is not the right pointer to
479 			 * follow */
480 			while (ptr->parent != cur)
481 				ptr = ptr->parent;
482 		}
483 	}
484 }
485 
486 void
html_file_open(G_GNUC_UNUSED GOFileOpener const * fo,GOIOContext * io_context,WorkbookView * wb_view,GsfInput * input)487 html_file_open (G_GNUC_UNUSED GOFileOpener const *fo, GOIOContext *io_context,
488 		WorkbookView *wb_view, GsfInput *input)
489 {
490 	guint8 const *buf;
491 	gsf_off_t size;
492 	int len, bomlen;
493 	htmlParserCtxtPtr ctxt;
494 	htmlDocPtr doc = NULL;
495 	xmlCharEncoding enc;
496 	GnmHtmlTableCtxt tc;
497 
498 	g_return_if_fail (input != NULL);
499 
500 	if (gsf_input_seek (input, 0, G_SEEK_SET))
501 		return;
502 
503 	size = gsf_input_size (input);
504 	if (size >= 4) {
505 		size -= 4;
506 		buf = gsf_input_read (input, 4, NULL);
507 		if (buf != NULL) {
508 			enc = xmlDetectCharEncoding(buf, 4);
509 			switch (enc) {
510 #if LIBXML_VERSION < 20702
511 			/* Skip byte order mark */
512 			case XML_CHAR_ENCODING_UCS4BE:
513 			case XML_CHAR_ENCODING_UCS4LE:
514 			case XML_CHAR_ENCODING_UCS4_2143:
515 			case XML_CHAR_ENCODING_UCS4_3412:
516 				if (buf[0] == 0xFE || buf[1] == 0xFE || buf[2] == 0xFE || buf[3] == 0xFE)
517 					bomlen = 4;
518 				else
519 					bomlen = 0;
520 				break;
521 			case XML_CHAR_ENCODING_EBCDIC:
522 				if (buf[0] == 0xDD)
523 					bomlen = 4;
524 				else
525 					bomlen = 0;
526 				break;
527 			case XML_CHAR_ENCODING_UTF16BE:
528 			case XML_CHAR_ENCODING_UTF16LE:
529 				if (buf[0] == 0xFE || buf[1] == 0xFE)
530 					bomlen = 2;
531 				else
532 					bomlen = 0;
533 				break;
534 			case XML_CHAR_ENCODING_UTF8:
535 				if (buf[0] == 0xef)
536 					bomlen = 3;
537 				else
538 					bomlen = 0;
539 				break;
540 #endif
541 			case XML_CHAR_ENCODING_NONE:
542 				bomlen = 0;
543 				/* Try to detect unmarked UTF16LE
544 				   (Firefox Windows clipboard, drag data all platforms) */
545 				if ((buf[0] >= 0x20 || g_ascii_isspace(buf[0])) &&
546 				    buf[1] == 0 &&
547 				    (buf[2] >= 0x20 || g_ascii_isspace(buf[2])) &&
548 				    buf[3] == 0)
549 					enc =  XML_CHAR_ENCODING_UTF16LE;
550 				break;
551 			default:
552 				bomlen = 0;
553 			}
554 			ctxt = htmlCreatePushParserCtxt (
555 				NULL, NULL, (char const *)(buf + bomlen),
556 				4 - bomlen, gsf_input_name (input), enc);
557 
558 			for (; size > 0 ; size -= len) {
559 				len = MIN (4096, size);
560 				buf = gsf_input_read (input, len, NULL);
561 				if (buf == NULL)
562 					break;
563 				htmlParseChunk (
564 					ctxt, (char const *)buf, len, 0);
565 			}
566 
567 			htmlParseChunk (ctxt, (char const *)buf, 0, 1);
568 			doc = ctxt->myDoc;
569 			htmlFreeParserCtxt (ctxt);
570 		}
571 	}
572 
573 	if (doc != NULL) {
574 		xmlNodePtr ptr;
575 		tc.sheet = NULL;
576 		tc.row   = -1;
577 		tc.wb_view = wb_view;
578 		for (ptr = doc->children; ptr != NULL ; ptr = ptr->next)
579 			html_search_for_tables (ptr, doc, wb_view, &tc);
580 		xmlFreeDoc (doc);
581 	} else
582 		go_io_error_info_set (io_context,
583 			go_error_info_new_str (_("Unable to parse the html.")));
584 }
585 
586 /* Quick and dirty html probe. */
587 gboolean
html_file_probe(G_GNUC_UNUSED GOFileOpener const * fo,GsfInput * input,G_GNUC_UNUSED GOFileProbeLevel pl)588 html_file_probe (G_GNUC_UNUSED GOFileOpener const *fo, GsfInput *input,
589 		 G_GNUC_UNUSED GOFileProbeLevel pl)
590 {
591 	gsf_off_t size = 200;
592 	guint8 const* buf = gsf_input_read (input, size, NULL);
593 	gchar *ulstr = NULL;
594 	GString *ustr;
595 	gboolean res = FALSE;
596 
597 	/* Avoid seeking in large streams - try to read, fall back if
598 	 * stream is too short.  (Actually, currently _size does not
599 	 * involve any syscalls -- MW).  */
600 	if (!buf) {
601 		size = gsf_input_size (input);
602 		buf = gsf_input_read (input, size, NULL);
603 		if (!buf)
604 			return res;
605 	}
606 
607 	if (go_guess_encoding (buf, size, NULL, &ustr, NULL)) {
608 		ulstr = g_utf8_strdown (ustr->str, -1);
609 		g_string_free (ustr, TRUE);
610 	}
611 
612 	if (!ulstr)
613 		return res;
614 
615 	res = (strstr (ulstr, "<table") != NULL ||
616 	       strstr (ulstr, "<html") != NULL ||
617 	       strstr (ulstr, "<!doctype html") != NULL);
618 
619 	g_free (ulstr);
620 
621 	return res;
622 }
623