1 /*
2 * html_read.c
3 *
4 * Copyright (C) 1999, 2000 Rasca, Berlin
5 * EMail: thron@gmx.de
6 * Copyright (c) 2001 Andreas J. Guelzow
7 * EMail: aguelzow@taliesin.ca
8 * Copyright (c) 2002 Jody Goldberg
9 * EMail: jody@gnome.org
10 *
11 * Contributors :
12 * Almer S. Tigelaar <almer1@dds.nl>
13 * Andreas J. Guelzow <aguelzow@taliesin.ca>
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
24 *
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, see <https://www.gnu.org/licenses/>.
27 */
28
29 #include <gnumeric-config.h>
30 #include <glib/gi18n-lib.h>
31 #include <gnumeric.h>
32 #include <string.h>
33 #include "html.h"
34
35 #include <sheet-object-cell-comment.h>
36 #include <workbook-view.h>
37 #include <workbook.h>
38 #include <sheet.h>
39 #include <sheet-merge.h>
40 #include <sheet-style.h>
41 #include <style.h>
42 #include <style-color.h>
43 #include <hlink.h>
44 #include <cell.h>
45 #include <ranges.h>
46 #include <goffice/goffice.h>
47
48 #include <gsf/gsf-input.h>
49 #include <libxml/HTMLparser.h>
50 #include <libxml/HTMLtree.h>
51
/* Pointer casts between libxml2's xmlChar strings and plain C strings. */
#define CC2XML(s) ((xmlChar const *)(s))	/* to xmlChar const * */
#define C2XML(s) ((xmlChar *)(s))		/* to xmlChar * */
#define CXML2C(s) ((char const *)(s))		/* to char const * */
#define XML2C(s) ((char *)(s))			/* to char * */
56
/* State shared by the routines that import one HTML table. */
typedef struct {
	Sheet *sheet;		/* sheet receiving the table; NULL until one is chosen */
	int row;		/* row being filled; -1 before the first <tr> is met */
	WorkbookView *wb_view;	/* view handed to html_read_table for nested tables */
} GnmHtmlTableCtxt;
62
/*
 * Forward declaration: html_read_content calls html_read_table for nested
 * tables, while html_read_table reaches html_read_content again through
 * html_read_rows and html_read_row.
 */
static void html_read_table (htmlNodePtr cur, htmlDocPtr doc,
			     WorkbookView *wb_view,
			     GnmHtmlTableCtxt *tc);
66
67
68 static Sheet *
html_get_sheet(char const * name,Workbook * wb)69 html_get_sheet (char const *name, Workbook *wb)
70 {
71 Sheet *sheet = NULL;
72
73 if (name) {
74 sheet = workbook_sheet_by_name (wb, name);
75 if (sheet == NULL) {
76 sheet = sheet_new (wb, name, GNM_DEFAULT_COLS, GNM_DEFAULT_ROWS);
77 workbook_sheet_attach (wb, sheet);
78 }
79 } else
80 sheet = workbook_sheet_add (wb, -1, GNM_DEFAULT_COLS, GNM_DEFAULT_ROWS);
81 return sheet;
82 }
83
84 static void
html_append_text(GString * buf,const xmlChar * text)85 html_append_text (GString *buf, const xmlChar *text)
86 {
87 const xmlChar *p;
88
89 while (*text) {
90 while (g_unichar_isspace (g_utf8_get_char (text)))
91 text = g_utf8_next_char (text);
92 if (*text) {
93 for (p = text;
94 *p && !g_unichar_isspace (g_utf8_get_char (p));
95 p = g_utf8_next_char (p))
96 ;
97 if (buf->len > 0)
98 g_string_append_c (buf, ' ');
99 g_string_append_len (buf, text, p - text);
100 text = p;
101 }
102 }
103 }
104
105 static void
html_read_content(htmlNodePtr cur,GString * buf,GnmStyle * mstyle,xmlBufferPtr a_buf,GSList ** hrefs,gboolean first,htmlDocPtr doc,GnmHtmlTableCtxt * tc)106 html_read_content (htmlNodePtr cur, GString *buf, GnmStyle *mstyle,
107 xmlBufferPtr a_buf, GSList **hrefs, gboolean first,
108 htmlDocPtr doc, GnmHtmlTableCtxt *tc)
109 {
110 htmlNodePtr ptr;
111
112 for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
113 if (ptr->type == XML_TEXT_NODE) {
114 if (g_utf8_validate (ptr->content, -1, NULL))
115 html_append_text (buf, ptr->content);
116 else
117 g_string_append (buf, _("[Warning: Invalid text string has been removed.]"));
118 } else if (ptr->type == XML_ELEMENT_NODE) {
119 if (first) {
120 if (xmlStrEqual (ptr->name, CC2XML ("i"))
121 || xmlStrEqual (ptr->name, CC2XML ("em")))
122 gnm_style_set_font_italic (mstyle, TRUE);
123 if (xmlStrEqual (ptr->name, CC2XML ("b")))
124 gnm_style_set_font_bold (mstyle, TRUE);
125 }
126 if (xmlStrEqual (ptr->name, CC2XML ("a"))) {
127 xmlAttrPtr props;
128 props = ptr->properties;
129 while (props) {
130 if (xmlStrEqual (props->name, CC2XML ("href")) && props->children) {
131 *hrefs = g_slist_prepend (
132 *hrefs, props->children);
133
134 }
135 props = props->next;
136 }
137 }
138 if (xmlStrEqual (ptr->name, CC2XML ("img"))) {
139 xmlAttrPtr props;
140 props = ptr->properties;
141 while (props) {
142 if (xmlStrEqual (props->name, CC2XML ("src")) && props->children) {
143 htmlNodeDump (a_buf, doc, props->children);
144 xmlBufferAdd (a_buf, CC2XML ("\n"), -1);
145 }
146 props = props->next;
147 }
148 }
149 if (xmlStrEqual (ptr->name, CC2XML ("table"))) {
150 Sheet *last_sheet = tc->sheet;
151 int last_row = tc->row;
152 tc->sheet = NULL;
153 tc->row = -1;
154 html_read_table (ptr, doc, tc->wb_view, tc);
155 if (tc->sheet) {
156 g_string_append_printf (buf, _("[see sheet %s]"), tc->sheet->name_quoted);
157 xmlBufferAdd (a_buf, CC2XML (_("The original html file is\n"
158 "using nested tables.")), -1);
159 }
160 tc->sheet = last_sheet;
161 tc->row = last_row;
162 } else
163 html_read_content
164 (ptr, buf, mstyle, a_buf, hrefs, first, doc, tc);
165 }
166 first = FALSE;
167 }
168 }
169
170 static void
html_read_row(htmlNodePtr cur,htmlDocPtr doc,GnmHtmlTableCtxt * tc)171 html_read_row (htmlNodePtr cur, htmlDocPtr doc, GnmHtmlTableCtxt *tc)
172 {
173 htmlNodePtr ptr;
174 int col = -1;
175
176 for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
177 if (xmlStrEqual (ptr->name, CC2XML ("td")) ||
178 xmlStrEqual (ptr->name, CC2XML ("th"))) {
179 GString *buf;
180 xmlBufferPtr a_buf;
181 xmlAttrPtr props;
182 int colspan = 1;
183 int rowspan = 1;
184 GnmCellPos pos;
185 GnmStyle *mstyle;
186 GSList *hrefs = NULL;
187 GnmHLink *lnk = NULL;
188
189 /* Check whether we need to skip merges from above */
190 pos.row = tc->row;
191 pos.col = col + 1;
192 while (gnm_sheet_merge_contains_pos (tc->sheet, &pos)) {
193 col++;
194 pos.col++;
195 }
196
197 /* Do we span across multiple rows or cols? */
198 props = ptr->properties;
199 while (props) {
200 if (xmlStrEqual (props->name, CC2XML ("colspan")) && props->children)
201 colspan = atoi (CXML2C (props->children->content));
202 if (xmlStrEqual (props->name, CC2XML ("rowspan")) && props->children)
203 rowspan = atoi (CXML2C (props->children->content));
204 props = props->next;
205 }
206 if (colspan < 1)
207 colspan = 1;
208 if (rowspan < 1)
209 rowspan = 1;
210
211 /* Let's figure out the content of the cell */
212 buf = g_string_new (NULL);
213 a_buf = xmlBufferCreate ();
214
215 mstyle = gnm_style_new_default ();
216 if (xmlStrEqual (ptr->name, CC2XML ("th")))
217 gnm_style_set_font_bold (mstyle, TRUE);
218
219 html_read_content (ptr, buf, mstyle, a_buf,
220 &hrefs, TRUE, doc, tc);
221
222
223 if (g_slist_length (hrefs) >= 1 &&
224 buf->len > 0) {
225 /* One hyperlink, and text to make it
226 * visible */
227 char *url;
228 xmlBufferPtr h_buf = xmlBufferCreate ();
229
230 hrefs = g_slist_reverse (hrefs);
231 htmlNodeDump (
232 h_buf, doc, (htmlNodePtr)hrefs->data);
233 url = g_strndup (
234 CXML2C (h_buf->content), h_buf->use);
235 if (strncmp (url, "mailto:",
236 strlen ("mailto:")) == 0)
237 lnk = gnm_hlink_new (
238 gnm_hlink_email_get_type (),
239 tc->sheet);
240 else
241 lnk = gnm_hlink_new (
242 gnm_hlink_url_get_type (),
243 tc->sheet);
244 gnm_hlink_set_target (lnk, url);
245 gnm_style_set_hlink (mstyle, lnk);
246 gnm_style_set_font_uline (mstyle,
247 UNDERLINE_SINGLE);
248 gnm_style_set_font_color (mstyle,
249 gnm_color_new_go (GO_COLOR_BLUE));
250 g_free (url);
251 xmlBufferFree (h_buf);
252 }
253 if (g_slist_length (hrefs) > 1 || buf->len <= 0) {
254 /* Multiple links,
255 * or no text to give hyperlink style,
256 * so put them in a comment */
257 GSList *l;
258
259 for (l = hrefs; l != NULL; l = l->next) {
260 htmlNodeDump (a_buf, doc,
261 (htmlNodePtr)l->data);
262 xmlBufferAdd (a_buf, CC2XML ("\n"),
263 -1);
264 }
265 }
266 g_slist_free (hrefs);
267 if (buf->len > 0) {
268 GnmCell *cell = sheet_cell_fetch (tc->sheet, col + 1, tc->row);
269 sheet_style_set_pos (tc->sheet, col + 1, tc->row, mstyle);
270 gnm_cell_set_text (cell, buf->str);
271 } else
272 gnm_style_unref (mstyle);
273
274 if (a_buf->use > 0) {
275 char *name;
276
277 name = g_strndup (CXML2C (a_buf->content), a_buf->use);
278 cell_set_comment (tc->sheet, &pos, NULL, name, NULL);
279 g_free (name);
280 }
281 g_string_free (buf, TRUE);
282 xmlBufferFree (a_buf);
283
284 /* If necessary create the merge */
285 if (colspan > 1 || rowspan > 1) {
286 GnmRange range;
287 GnmRange *r = ⦥
288
289 range_init (r, col + 1, tc->row, col + colspan, tc->row + rowspan - 1);
290 gnm_sheet_merge_add (tc->sheet, r, FALSE, NULL);
291 }
292
293 col += colspan;
294 }
295 }
296 }
297
298 static void
html_read_rows(htmlNodePtr cur,htmlDocPtr doc,Workbook * wb,GnmHtmlTableCtxt * tc)299 html_read_rows (htmlNodePtr cur, htmlDocPtr doc, Workbook *wb,
300 GnmHtmlTableCtxt *tc)
301 {
302 htmlNodePtr ptr;
303
304 for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
305 if (ptr->type != XML_ELEMENT_NODE)
306 continue;
307 if (xmlStrEqual (ptr->name, CC2XML ("tr"))) {
308 tc->row++;
309 if (tc->sheet == NULL)
310 tc->sheet = html_get_sheet (NULL, wb);
311 html_read_row (ptr, doc, tc);
312 }
313 }
314 }
315
316 static void
html_read_table(htmlNodePtr cur,htmlDocPtr doc,WorkbookView * wb_view,GnmHtmlTableCtxt * tc)317 html_read_table (htmlNodePtr cur, htmlDocPtr doc, WorkbookView *wb_view,
318 GnmHtmlTableCtxt *tc)
319 {
320 Workbook *wb;
321 htmlNodePtr ptr, ptr2;
322
323 g_return_if_fail (cur != NULL);
324 g_return_if_fail (wb_view != NULL);
325
326 wb = wb_view_get_workbook (wb_view);
327 for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
328 if (ptr->type != XML_ELEMENT_NODE)
329 continue;
330 if (xmlStrEqual (ptr->name, CC2XML ("caption"))) {
331 xmlBufferPtr buf;
332 buf = xmlBufferCreate ();
333 for (ptr2 = ptr->children; ptr2 != NULL ; ptr2 = ptr2->next) {
334 htmlNodeDump (buf, doc, ptr2);
335 }
336 if (buf->use > 0) {
337 char *name;
338 name = g_strndup (CXML2C (buf->content), buf->use);
339 tc->sheet = html_get_sheet (name, wb);
340 g_free (name);
341 }
342 xmlBufferFree (buf);
343 } else if (xmlStrEqual (ptr->name, CC2XML ("thead")) ||
344 xmlStrEqual (ptr->name, CC2XML ("tfoot")) ||
345 xmlStrEqual (ptr->name, CC2XML ("tbody"))) {
346 html_read_rows (ptr, doc, wb, tc);
347 } else if (xmlStrEqual (ptr->name, CC2XML ("tr"))) {
348 html_read_rows (cur, doc, wb, tc);
349 break;
350 }
351 }
352 }
353
/* Element types which imply that we are inside a table.
 * NULL-terminated, as expected by is_elt_type (). */
static char const *table_start_elt_types[] = {
	"caption",
	"col",
	"colgroup",
	"tbody",
	"tfoot",
	"thead",
	"tr",
	NULL
};

/* Element types which imply that we are inside a row.
 * NULL-terminated, as expected by is_elt_type (). */
static char const *row_start_elt_types[] = {
	"td",
	"th",
	NULL
};

/* Element types which occur inside tables and rows, but also outside.
 * They never end an inferred table or row. */
static char const *cont_elt_types[] = {
	"del",
	"ins",
	NULL
};
379
380 static gboolean
is_elt_type(htmlNodePtr ptr,char const ** types)381 is_elt_type (htmlNodePtr ptr, char const ** types)
382 {
383 char const **p;
384 gboolean ret = FALSE;
385
386 for (p = types; *p; p++)
387 if (xmlStrEqual (ptr->name, CC2XML ((*p)))) {
388 ret = TRUE;
389 break;
390 }
391
392 return ret;
393 }
394
395 static gboolean
starts_inferred_table(htmlNodePtr ptr)396 starts_inferred_table (htmlNodePtr ptr)
397 {
398 return ((ptr->type == XML_ELEMENT_NODE) &&
399 is_elt_type (ptr, table_start_elt_types));
400 }
401
402 static gboolean
ends_inferred_table(htmlNodePtr ptr)403 ends_inferred_table (htmlNodePtr ptr)
404 {
405 return ((ptr->type == XML_ELEMENT_NODE) &&
406 !(is_elt_type (ptr, table_start_elt_types) ||
407 is_elt_type (ptr, cont_elt_types)));
408 }
409
410 static gboolean
starts_inferred_row(htmlNodePtr ptr)411 starts_inferred_row (htmlNodePtr ptr)
412 {
413 return ((ptr->type == XML_ELEMENT_NODE) &&
414 is_elt_type (ptr, row_start_elt_types));
415 }
416
417 static gboolean
ends_inferred_row(htmlNodePtr ptr)418 ends_inferred_row (htmlNodePtr ptr)
419 {
420 return ((ptr->type == XML_ELEMENT_NODE) &&
421 !(is_elt_type (ptr, row_start_elt_types) ||
422 is_elt_type (ptr, cont_elt_types)));
423 }
424
425 /*
426 * Handles incomplete html fragments as may occur on the clipboard,
427 * e.g. a <td> without <tr> and <table> in front of it.
428 */
429 static void
html_search_for_tables(htmlNodePtr cur,htmlDocPtr doc,WorkbookView * wb_view,GnmHtmlTableCtxt * tc)430 html_search_for_tables (htmlNodePtr cur, htmlDocPtr doc,
431 WorkbookView *wb_view, GnmHtmlTableCtxt *tc)
432 {
433 htmlNodePtr ptr;
434
435 if (cur == NULL) {
436 xmlGenericError(xmlGenericErrorContext,
437 "htmlNodeDumpFormatOutput : node == NULL\n");
438 return;
439 }
440
441 if (cur->type != XML_ELEMENT_NODE)
442 return;
443
444 if (xmlStrEqual (cur->name, CC2XML ("table"))) {
445 html_read_table (cur, doc, wb_view, tc);
446 } else if (starts_inferred_table (cur) || starts_inferred_row (cur)) {
447 htmlNodePtr tnode = xmlNewNode (NULL, "table");
448
449 /* Link in a table node */
450 xmlAddPrevSibling (cur, tnode);
451 if (starts_inferred_row (cur)) {
452 htmlNodePtr rnode = xmlNewNode (NULL, "tr");
453
454 /* Link in a row node */
455 xmlAddChild (tnode, rnode);
456 /* Make following elements children of the row node,
457 * until we meet one which isn't legal in a row. */
458 while ((ptr = tnode->next) != NULL) {
459 if (ends_inferred_row (ptr))
460 break;
461 xmlUnlinkNode (ptr);
462 xmlAddChild (rnode, ptr);
463 }
464 }
465 /* Make following elements children of the row node,
466 * until we meet one which isn't legal in a table. */
467 while ((ptr = tnode->next) != NULL) {
468 if (ends_inferred_table (ptr))
469 break;
470 xmlUnlinkNode (ptr);
471 xmlAddChild (tnode, ptr);
472 }
473 html_read_table (tnode, doc, wb_view, tc);
474 } else {
475 for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
476 html_search_for_tables (ptr, doc, wb_view, tc);
477 /* ptr may now have been pushed down in the tree,
478 * if so, ptr->next is not the right pointer to
479 * follow */
480 while (ptr->parent != cur)
481 ptr = ptr->parent;
482 }
483 }
484 }
485
486 void
html_file_open(G_GNUC_UNUSED GOFileOpener const * fo,GOIOContext * io_context,WorkbookView * wb_view,GsfInput * input)487 html_file_open (G_GNUC_UNUSED GOFileOpener const *fo, GOIOContext *io_context,
488 WorkbookView *wb_view, GsfInput *input)
489 {
490 guint8 const *buf;
491 gsf_off_t size;
492 int len, bomlen;
493 htmlParserCtxtPtr ctxt;
494 htmlDocPtr doc = NULL;
495 xmlCharEncoding enc;
496 GnmHtmlTableCtxt tc;
497
498 g_return_if_fail (input != NULL);
499
500 if (gsf_input_seek (input, 0, G_SEEK_SET))
501 return;
502
503 size = gsf_input_size (input);
504 if (size >= 4) {
505 size -= 4;
506 buf = gsf_input_read (input, 4, NULL);
507 if (buf != NULL) {
508 enc = xmlDetectCharEncoding(buf, 4);
509 switch (enc) {
510 #if LIBXML_VERSION < 20702
511 /* Skip byte order mark */
512 case XML_CHAR_ENCODING_UCS4BE:
513 case XML_CHAR_ENCODING_UCS4LE:
514 case XML_CHAR_ENCODING_UCS4_2143:
515 case XML_CHAR_ENCODING_UCS4_3412:
516 if (buf[0] == 0xFE || buf[1] == 0xFE || buf[2] == 0xFE || buf[3] == 0xFE)
517 bomlen = 4;
518 else
519 bomlen = 0;
520 break;
521 case XML_CHAR_ENCODING_EBCDIC:
522 if (buf[0] == 0xDD)
523 bomlen = 4;
524 else
525 bomlen = 0;
526 break;
527 case XML_CHAR_ENCODING_UTF16BE:
528 case XML_CHAR_ENCODING_UTF16LE:
529 if (buf[0] == 0xFE || buf[1] == 0xFE)
530 bomlen = 2;
531 else
532 bomlen = 0;
533 break;
534 case XML_CHAR_ENCODING_UTF8:
535 if (buf[0] == 0xef)
536 bomlen = 3;
537 else
538 bomlen = 0;
539 break;
540 #endif
541 case XML_CHAR_ENCODING_NONE:
542 bomlen = 0;
543 /* Try to detect unmarked UTF16LE
544 (Firefox Windows clipboard, drag data all platforms) */
545 if ((buf[0] >= 0x20 || g_ascii_isspace(buf[0])) &&
546 buf[1] == 0 &&
547 (buf[2] >= 0x20 || g_ascii_isspace(buf[2])) &&
548 buf[3] == 0)
549 enc = XML_CHAR_ENCODING_UTF16LE;
550 break;
551 default:
552 bomlen = 0;
553 }
554 ctxt = htmlCreatePushParserCtxt (
555 NULL, NULL, (char const *)(buf + bomlen),
556 4 - bomlen, gsf_input_name (input), enc);
557
558 for (; size > 0 ; size -= len) {
559 len = MIN (4096, size);
560 buf = gsf_input_read (input, len, NULL);
561 if (buf == NULL)
562 break;
563 htmlParseChunk (
564 ctxt, (char const *)buf, len, 0);
565 }
566
567 htmlParseChunk (ctxt, (char const *)buf, 0, 1);
568 doc = ctxt->myDoc;
569 htmlFreeParserCtxt (ctxt);
570 }
571 }
572
573 if (doc != NULL) {
574 xmlNodePtr ptr;
575 tc.sheet = NULL;
576 tc.row = -1;
577 tc.wb_view = wb_view;
578 for (ptr = doc->children; ptr != NULL ; ptr = ptr->next)
579 html_search_for_tables (ptr, doc, wb_view, &tc);
580 xmlFreeDoc (doc);
581 } else
582 go_io_error_info_set (io_context,
583 go_error_info_new_str (_("Unable to parse the html.")));
584 }
585
586 /* Quick and dirty html probe. */
587 gboolean
html_file_probe(G_GNUC_UNUSED GOFileOpener const * fo,GsfInput * input,G_GNUC_UNUSED GOFileProbeLevel pl)588 html_file_probe (G_GNUC_UNUSED GOFileOpener const *fo, GsfInput *input,
589 G_GNUC_UNUSED GOFileProbeLevel pl)
590 {
591 gsf_off_t size = 200;
592 guint8 const* buf = gsf_input_read (input, size, NULL);
593 gchar *ulstr = NULL;
594 GString *ustr;
595 gboolean res = FALSE;
596
597 /* Avoid seeking in large streams - try to read, fall back if
598 * stream is too short. (Actually, currently _size does not
599 * involve any syscalls -- MW). */
600 if (!buf) {
601 size = gsf_input_size (input);
602 buf = gsf_input_read (input, size, NULL);
603 if (!buf)
604 return res;
605 }
606
607 if (go_guess_encoding (buf, size, NULL, &ustr, NULL)) {
608 ulstr = g_utf8_strdown (ustr->str, -1);
609 g_string_free (ustr, TRUE);
610 }
611
612 if (!ulstr)
613 return res;
614
615 res = (strstr (ulstr, "<table") != NULL ||
616 strstr (ulstr, "<html") != NULL ||
617 strstr (ulstr, "<!doctype html") != NULL);
618
619 g_free (ulstr);
620
621 return res;
622 }
623