1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3 * Copyright (C) 1997 Martin Jones (mjones@kde.org)
4 * (C) 1997 Torben Weis (weis@kde.org)
5 * (C) 1999 Anders Carlsson (andersca@gnu.org)
6 * (C) 2000 Helix Code, Inc., Radek Doulik (rodo@helixcode.com)
7 * (C) 2001 Ximian, Inc.
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Library General Public
11 * License as published by the Free Software Foundation; either
12 * version 2 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Library General Public License for more details.
18 *
19 * You should have received a copy of the GNU Library General Public License
20 * along with this library; see the file COPYING.LIB. If not, write to
21 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22 * Boston, MA 02110-1301, USA.
23 */
24
25 /* The HTML Tokenizer */
26 #include <config.h>
27 #include <ctype.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include "htmltokenizer.h"
31 #include "htmlentity.h"
32
33 enum {
34 HTML_TOKENIZER_BEGIN_SIGNAL,
35 HTML_TOKENIZER_END_SIGNAL,
36 HTML_TOKENIZER_CHANGECONTENT_SIGNAL,
37 HTML_TOKENIZER_CHANGEENGINE_SIGNAL,
38 HTML_TOKENIZER_LAST_SIGNAL
39 };
40
41 static guint html_tokenizer_signals[HTML_TOKENIZER_LAST_SIGNAL] = { 0 };
42
/* Default size, in bytes, of a newly created token buffer. */
#define TOKEN_BUFFER_SIZE (1 << 10)

/* Debug trace hook; expands to nothing so dt (...) statements vanish. */
#define dt(x)

typedef struct _HTMLBlockingToken HTMLBlockingToken;
typedef struct _HTMLTokenBuffer   HTMLTokenBuffer;

/* Token kinds accepted by html_tokenizer_blocking_push (). */
typedef enum { Table } HTMLTokenType;
50
51 struct _HTMLTokenBuffer {
52 gint size;
53 gint used;
54 gchar * data;
55 };
56
57 struct _HTMLTokenizerPrivate {
58
59 /* token buffers list */
60 GList *token_buffers;
61
62 /* current read_buf position in list */
63 GList *read_cur;
64
65 /* current read buffer */
66 HTMLTokenBuffer * read_buf;
67 HTMLTokenBuffer * write_buf;
68
69 /* position in the read_buf */
70 gint read_pos;
71
72 /* non-blocking and blocking unreaded tokens in tokenizer */
73 gint tokens_num;
74 gint blocking_tokens_num;
75
76 gchar *dest;
77 gchar *buffer;
78 gint size;
79
80 gboolean skipLF; /* Skip the LF par of a CRLF sequence */
81
82 gboolean tag; /* Are we in an html tag? */
83 gboolean tquote; /* Are we in quotes in an html tag? */
84 gboolean startTag;
85 gboolean comment; /* Are we in a comment block? */
86 gboolean title; /* Are we in a <title> block? */
87 gboolean style; /* Are we in a <style> block? */
88 gboolean script; /* Are we in a <script> block? */
89 gboolean textarea; /* Are we in a <textarea> block? */
90 gint pre; /* Are we in a <pre> block? */
91 gboolean select; /* Are we in a <select> block? */
92 gboolean extension; /* Are we in an <!-- +GtkHTML: sequence? */
93
94 enum {
95 NoneDiscard = 0,
96 SpaceDiscard,
97 LFDiscard
98 } discard;
99
100 enum {
101 NonePending = 0,
102 SpacePending,
103 LFPending,
104 TabPending
105 } pending;
106
107 gchar searchBuffer[20];
108 gint searchCount;
109 gint searchGtkHTMLCount;
110 gint searchExtensionEndCount;
111
112 gchar *scriptCode;
113 gint scriptCodeSize;
114 gint scriptCodeMaxSize;
115
116 GList *blocking; /* Blocking tokens */
117
118 const gchar *searchFor;
119
120 gboolean enableconvert;
121
122 gchar * content_type;
123 /*convert*/
124 GIConv iconv_cd;
125
126 };
127
128 static const gchar *commentStart = "<!--";
129 static const gchar *scriptEnd = "</script>";
130 static const gchar *styleEnd = "</style>";
131 static const gchar *gtkhtmlStart = "+gtkhtml:";
132
133 enum quoteEnum {
134 NO_QUOTE = 0,
135 SINGLE_QUOTE,
136 DOUBLE_QUOTE
137 };
138
139 /* private tokenizer functions */
140 static void html_tokenizer_reset (HTMLTokenizer *t);
141 static void html_tokenizer_add_pending (HTMLTokenizer *t);
142 static void html_tokenizer_append_token (HTMLTokenizer *t,
143 const gchar *string,
144 gint len);
145 static void html_tokenizer_append_token_buffer (HTMLTokenizer *t,
146 gint min_size);
147
148 /* default implementations of tokenization functions */
149 static void html_tokenizer_finalize (GObject *);
150 static void html_tokenizer_real_change (HTMLTokenizer *, const gchar *content_type);
151 static void html_tokenizer_real_begin (HTMLTokenizer *, const gchar *content_type);
152 static void html_tokenizer_real_engine_type (HTMLTokenizer *t, gboolean engine_type);
153 static void html_tokenizer_real_write (HTMLTokenizer *, const gchar *str, gsize size);
154 static void html_tokenizer_real_end (HTMLTokenizer *);
155 static const gchar *
156 html_tokenizer_real_get_content_type (HTMLTokenizer *);
157 static gboolean
158 html_tokenizer_real_get_engine_type (HTMLTokenizer *);
159 static gchar *html_tokenizer_real_peek_token (HTMLTokenizer *);
160 static gchar *html_tokenizer_real_next_token (HTMLTokenizer *);
161 static gboolean html_tokenizer_real_has_more_tokens (HTMLTokenizer *);
162 static gchar *html_tokenizer_converted_token (HTMLTokenizer *t,const gchar * token);
163
164 static HTMLTokenizer *html_tokenizer_real_clone (HTMLTokenizer *);
165
166 /* blocking tokens */
167 static const gchar *html_tokenizer_blocking_get_name (HTMLTokenizer *t);
168 static void html_tokenizer_blocking_pop (HTMLTokenizer *t);
169 static void html_tokenizer_blocking_push (HTMLTokenizer *t,
170 HTMLTokenType tt);
171 static void html_tokenizer_tokenize_one_char (HTMLTokenizer *t,
172 const gchar **src);
173 static void add_char (HTMLTokenizer *t, gchar c);
174
175 gboolean is_need_convert (const gchar * token);
176
177 gchar * html_tokenizer_convert_entity (gchar * token);
178
179 static GObjectClass *parent_class = NULL;
180
181 static void
html_tokenizer_class_init(HTMLTokenizerClass * klass)182 html_tokenizer_class_init (HTMLTokenizerClass *klass)
183 {
184 GObjectClass *object_class = (GObjectClass *) klass;
185
186 parent_class = g_type_class_ref (G_TYPE_OBJECT);
187
188 html_tokenizer_signals[HTML_TOKENIZER_CHANGECONTENT_SIGNAL] =
189 g_signal_new ("change",
190 G_TYPE_FROM_CLASS (klass),
191 G_SIGNAL_RUN_LAST,
192 G_STRUCT_OFFSET (HTMLTokenizerClass, change),
193 NULL, NULL,
194 g_cclosure_marshal_VOID__POINTER,
195 G_TYPE_NONE,
196 1, G_TYPE_POINTER);
197
198 html_tokenizer_signals[HTML_TOKENIZER_CHANGEENGINE_SIGNAL] =
199 g_signal_new ("engine",
200 G_TYPE_FROM_CLASS (klass),
201 G_SIGNAL_RUN_LAST,
202 G_STRUCT_OFFSET (HTMLTokenizerClass, engine),
203 NULL, NULL,
204 g_cclosure_marshal_VOID__POINTER,
205 G_TYPE_NONE,
206 1, G_TYPE_POINTER);
207
208 html_tokenizer_signals[HTML_TOKENIZER_BEGIN_SIGNAL] =
209 g_signal_new ("begin",
210 G_TYPE_FROM_CLASS (klass),
211 G_SIGNAL_RUN_LAST,
212 G_STRUCT_OFFSET (HTMLTokenizerClass, begin),
213 NULL, NULL,
214 g_cclosure_marshal_VOID__POINTER,
215 G_TYPE_NONE,
216 1, G_TYPE_POINTER);
217
218 html_tokenizer_signals[HTML_TOKENIZER_END_SIGNAL] =
219 g_signal_new ("end",
220 G_TYPE_FROM_CLASS (klass),
221 G_SIGNAL_RUN_LAST,
222 G_STRUCT_OFFSET (HTMLTokenizerClass, end),
223 NULL, NULL,
224 g_cclosure_marshal_VOID__VOID,
225 G_TYPE_NONE,
226 0);
227
228 object_class->finalize = html_tokenizer_finalize;
229
230 klass->change = html_tokenizer_real_change;
231 klass->engine = html_tokenizer_real_engine_type;
232 klass->begin = html_tokenizer_real_begin;
233 klass->end = html_tokenizer_real_end;
234
235 klass->write = html_tokenizer_real_write;
236 klass->peek_token = html_tokenizer_real_peek_token;
237 klass->next_token = html_tokenizer_real_next_token;
238 klass->get_content_type = html_tokenizer_real_get_content_type;
239 klass->get_engine_type = html_tokenizer_real_get_engine_type;
240 klass->has_more = html_tokenizer_real_has_more_tokens;
241 klass->clone = html_tokenizer_real_clone;
242 }
243
244 static void
html_tokenizer_init(HTMLTokenizer * t)245 html_tokenizer_init (HTMLTokenizer *t)
246 {
247 struct _HTMLTokenizerPrivate *p;
248
249 t->priv = p = g_new0 (struct _HTMLTokenizerPrivate, 1);
250
251 p->token_buffers = NULL;
252 p->read_cur = NULL;
253 p->read_buf = NULL;
254 p->write_buf = NULL;
255 p->read_pos = 0;
256
257 p->dest = NULL;
258 p->buffer = NULL;
259 p->size = 0;
260
261 p->skipLF = FALSE;
262 p->tag = FALSE;
263 p->tquote = FALSE;
264 p->startTag = FALSE;
265 p->comment = FALSE;
266 p->title = FALSE;
267 p->style = FALSE;
268 p->script = FALSE;
269 p->textarea = FALSE;
270 p->pre = 0;
271 p->select = FALSE;
272 p->extension = FALSE;
273
274 p->discard = NoneDiscard;
275 p->pending = NonePending;
276
277 memset (p->searchBuffer, 0, sizeof (p->searchBuffer));
278 p->searchCount = 0;
279 p->searchGtkHTMLCount = 0;
280
281 p->scriptCode = NULL;
282 p->scriptCodeSize = 0;
283 p->scriptCodeMaxSize = 0;
284
285 p->blocking = NULL;
286
287 p->searchFor = NULL;
288
289 /* Use old logic and not convert charset */
290 p->enableconvert = FALSE;
291
292 p->content_type = g_strdup ("html/text; charset=utf-8");
293 }
294
295 static void
html_tokenizer_finalize(GObject * obj)296 html_tokenizer_finalize (GObject *obj)
297 {
298 HTMLTokenizer *t = HTML_TOKENIZER (obj);
299
300 html_tokenizer_reset (t);
301
302 if (is_valid_g_iconv (t->priv->iconv_cd))
303 g_iconv_close (t->priv->iconv_cd);
304
305 if (t->priv->content_type)
306 g_free (t->priv->content_type);
307
308 g_free (t->priv);
309 t->priv = NULL;
310
311 G_OBJECT_CLASS (parent_class)->finalize (obj);
312 }
313
314 GType
html_tokenizer_get_type(void)315 html_tokenizer_get_type (void)
316 {
317 static GType html_tokenizer_type = 0;
318
319 if (!html_tokenizer_type) {
320 static const GTypeInfo html_tokenizer_info = {
321 sizeof (HTMLTokenizerClass),
322 NULL,
323 NULL,
324 (GClassInitFunc) html_tokenizer_class_init,
325 NULL,
326 NULL,
327 sizeof (HTMLTokenizer),
328 1,
329 (GInstanceInitFunc) html_tokenizer_init,
330 };
331 html_tokenizer_type = g_type_register_static (G_TYPE_OBJECT, "HTMLTokenizer", &html_tokenizer_info, 0);
332 }
333
334 return html_tokenizer_type;
335 }
336
337 static HTMLTokenBuffer *
html_token_buffer_new(gint size)338 html_token_buffer_new (gint size)
339 {
340 HTMLTokenBuffer *nb = g_new (HTMLTokenBuffer, 1);
341
342 nb->data = g_new (gchar, size);
343 nb->size = size;
344 nb->used = 0;
345
346 return nb;
347 }
348
349 static void
html_token_buffer_destroy(HTMLTokenBuffer * tb)350 html_token_buffer_destroy (HTMLTokenBuffer *tb)
351 {
352 g_free (tb->data);
353 g_free (tb);
354 }
355
356 static gboolean
html_token_buffer_append_token(HTMLTokenBuffer * buf,const gchar * token,gint len)357 html_token_buffer_append_token (HTMLTokenBuffer *buf,
358 const gchar *token,
359 gint len)
360 {
361
362 /* check if we have enough free space */
363 if (len + 1 > buf->size - buf->used) {
364 return FALSE;
365 }
366
367 /* copy token and terminate with zero */
368 strncpy (buf->data + buf->used, token, len);
369 buf->used += len;
370 buf->data[buf->used] = 0;
371 buf->used++;
372
373 dt (printf ("html_token_buffer_append_token: '%s'\n", buf->data + buf->used - 1 - len));
374
375 return TRUE;
376 }
377
378 HTMLTokenizer *
html_tokenizer_new(void)379 html_tokenizer_new (void)
380 {
381 return (HTMLTokenizer *) g_object_new (HTML_TYPE_TOKENIZER, NULL);
382 }
383
384 void
html_tokenizer_destroy(HTMLTokenizer * t)385 html_tokenizer_destroy (HTMLTokenizer *t)
386 {
387 g_return_if_fail (t && HTML_IS_TOKENIZER (t));
388
389 g_object_unref (G_OBJECT (t));
390 }
391
392 static gchar *
html_tokenizer_real_peek_token(HTMLTokenizer * t)393 html_tokenizer_real_peek_token (HTMLTokenizer *t)
394 {
395 struct _HTMLTokenizerPrivate *p = t->priv;
396 gchar *token;
397
398 g_assert (p->read_buf);
399
400 if (p->read_buf->used > p->read_pos) {
401 token = p->read_buf->data + p->read_pos;
402 } else {
403 GList *next;
404 HTMLTokenBuffer *buffer;
405
406 g_assert (p->read_cur);
407 g_assert (p->read_buf);
408
409 /* lookup for next buffer */
410 next = p->read_cur->next;
411 g_assert (next);
412
413 buffer = (HTMLTokenBuffer *) next->data;
414
415 g_return_val_if_fail (buffer->used != 0, NULL);
416
417 /* finally get first token */
418 token = buffer->data;
419 }
420
421 return html_tokenizer_converted_token (t,token);
422 }
423
424 /* test iconv for valid*/
425 gboolean
is_valid_g_iconv(const GIConv iconv_cd)426 is_valid_g_iconv (const GIConv iconv_cd)
427 {
428 return iconv_cd != NULL && iconv_cd != (GIConv) - 1;
429 }
430
431 /*Convert only chars when code >127*/
432 gboolean
is_need_convert(const gchar * token)433 is_need_convert (const gchar *token)
434 {
435 gint i = strlen (token);
436 for (; i >= 0; i--)
437 if (token[i]&128)
438 return TRUE;
439 return FALSE;
440 }
441
442 /*Convert entity values in already converted to right charset token*/
443 gchar *
html_tokenizer_convert_entity(gchar * token)444 html_tokenizer_convert_entity (gchar *token)
445 {
446 gchar *full_pos;
447 gchar *resulted;
448 gchar *write_pos;
449 gchar *read_pos;
450
451 if (token == NULL)
452 return NULL;
453
454 /*stop pointer*/
455 full_pos = token + strlen (token);
456 resulted = g_new (gchar, strlen (token) + 1);
457 write_pos = resulted;
458 read_pos = token;
459 while (read_pos < full_pos) {
460 gsize count_chars = strcspn (read_pos, "&");
461 memcpy (write_pos, read_pos, count_chars);
462 write_pos += count_chars;
463 read_pos += count_chars;
464 /*may be end string?*/
465 if (read_pos < full_pos)
466 if (*read_pos == '&') {
467 /*value to add*/
468 gunichar value = INVALID_ENTITY_CHARACTER_MARKER;
469 /*skip not needed &*/
470 read_pos++;
471 count_chars = strcspn (read_pos, ";");
472 if (count_chars < 14 && count_chars > 1) {
473 /*save for recovery*/
474 gchar save_gchar = *(read_pos + count_chars);
475 *(read_pos + count_chars)=0;
476 /* &#******; */
477 if (*read_pos == '#') {
478 /* � */
479 if (isdigit (*(read_pos + 1))) {
480 value = strtoull (read_pos + 1, NULL, 10);
481 /* Ý */
482 } else if (*(read_pos + 1) == 'x') {
483 value = strtoull (read_pos + 2, NULL, 16);
484 }
485 } else {
486 value = html_entity_parse (read_pos, strlen (read_pos));
487 }
488 if (*read_pos == '#' || value != INVALID_ENTITY_CHARACTER_MARKER) {
489 write_pos += g_unichar_to_utf8 (value, write_pos);
490 read_pos += (count_chars + 1);
491 } else {
492 /*recovery old value - it's not entity*/
493 write_pos += g_unichar_to_utf8 ('&', write_pos);
494 *(read_pos + count_chars) = save_gchar;
495 }
496 }
497 else
498 /*very large string*/
499 write_pos += g_unichar_to_utf8 ('&', write_pos);
500 }
501 }
502 *write_pos = 0;
503 free (token);
504
505 return resulted;
506 }
507
508 /*convert text to utf8 - allways alloc memmory*/
509 gchar *
convert_text_encoding(const GIConv iconv_cd,const gchar * token)510 convert_text_encoding (const GIConv iconv_cd,
511 const gchar *token)
512 {
513 gsize currlength;
514 gchar *newbuffer;
515 gchar *returnbuffer;
516 const gchar *current;
517 gsize newlength;
518 gsize oldlength;
519
520 if (token == NULL)
521 return NULL;
522
523 if (is_valid_g_iconv (iconv_cd) && is_need_convert (token)) {
524 currlength = strlen (token);
525 current = token;
526 newlength = currlength * 7 + 1;
527 oldlength = newlength;
528 newbuffer = g_new (gchar, newlength);
529 returnbuffer = newbuffer;
530
531 while (currlength > 0) {
532 /*function not change current, but g_iconv use not const source*/
533 g_iconv (iconv_cd, (gchar **) ¤t, &currlength, &newbuffer, &newlength);
534 if (currlength > 0) {
535 g_warning ("IconvError=%s", current);
536 *newbuffer = INVALID_ENTITY_CHARACTER_MARKER;
537 newbuffer++;
538 current++;
539 currlength--;
540 newlength--;
541 }
542 }
543 returnbuffer[oldlength - newlength] = '\0';
544 returnbuffer = g_realloc (returnbuffer, oldlength - newlength + 1);
545 return returnbuffer;
546 }
547 return g_strdup (token);
548 }
549
550 static gchar *
html_tokenizer_converted_token(HTMLTokenizer * t,const gchar * token)551 html_tokenizer_converted_token (HTMLTokenizer *t,
552 const gchar *token)
553 {
554 if (token != NULL) {
555 struct _HTMLTokenizerPrivate *p = t->priv;
556 return html_tokenizer_convert_entity (convert_text_encoding (p->iconv_cd, token));
557 }
558
559 return NULL;
560 }
561
562 static const gchar *
html_tokenizer_real_get_content_type(HTMLTokenizer * t)563 html_tokenizer_real_get_content_type (HTMLTokenizer *t)
564 {
565 struct _HTMLTokenizerPrivate *p = t->priv;
566
567 if (p->content_type)
568 return p->content_type;
569
570 return NULL;
571 }
572
573 static gboolean
html_tokenizer_real_get_engine_type(HTMLTokenizer * t)574 html_tokenizer_real_get_engine_type (HTMLTokenizer *t)
575 {
576 struct _HTMLTokenizerPrivate *p = t->priv;
577
578 return p->enableconvert;
579 }
580
581 static gchar *
html_tokenizer_real_next_token(HTMLTokenizer * t)582 html_tokenizer_real_next_token (HTMLTokenizer *t)
583 {
584 struct _HTMLTokenizerPrivate *p = t->priv;
585 gchar *token;
586
587 g_assert (p->read_buf);
588
589 /* token is in current read_buf */
590 if (p->read_buf->used > p->read_pos) {
591 token = p->read_buf->data + p->read_pos;
592 p->read_pos += strlen (token) + 1;
593 } else {
594 GList *new;
595
596 g_assert (p->read_cur);
597 g_assert (p->read_buf);
598
599 /* lookup for next buffer */
600 new = p->read_cur->next;
601 g_assert (new);
602
603 /* destroy current buffer */
604 p->token_buffers = g_list_remove (p->token_buffers, p->read_buf);
605 html_token_buffer_destroy (p->read_buf);
606
607 p->read_cur = new;
608 p->read_buf = (HTMLTokenBuffer *) new->data;
609
610 g_return_val_if_fail (p->read_buf->used != 0, NULL);
611
612 /* finally get first token */
613 token = p->read_buf->data;
614 p->read_pos = strlen (token) + 1;
615 }
616
617 p->tokens_num--;
618 g_assert (p->tokens_num >= 0);
619
620 return html_tokenizer_converted_token (t, token);
621 }
622
623 static gboolean
html_tokenizer_real_has_more_tokens(HTMLTokenizer * t)624 html_tokenizer_real_has_more_tokens (HTMLTokenizer *t)
625 {
626 return t->priv->tokens_num > 0;
627 }
628
629 static HTMLTokenizer *
html_tokenizer_real_clone(HTMLTokenizer * t)630 html_tokenizer_real_clone (HTMLTokenizer *t)
631 {
632 return html_tokenizer_new ();
633 }
634
635 static void
html_tokenizer_reset(HTMLTokenizer * t)636 html_tokenizer_reset (HTMLTokenizer *t)
637 {
638 struct _HTMLTokenizerPrivate *p = t->priv;
639 GList *cur = p->token_buffers;
640
641 /* free remaining token buffers */
642 while (cur) {
643 g_assert (cur->data);
644 html_token_buffer_destroy ((HTMLTokenBuffer *) cur->data);
645 cur = cur->next;
646 }
647
648 /* reset buffer list */
649 g_list_free (p->token_buffers);
650 p->token_buffers = p->read_cur = NULL;
651 p->read_buf = p->write_buf = NULL;
652 p->read_pos = 0;
653
654 /* reset token counters */
655 p->tokens_num = p->blocking_tokens_num = 0;
656
657 if (p->buffer)
658 g_free (p->buffer);
659 p->buffer = NULL;
660 p->dest = NULL;
661 p->size = 0;
662
663 if (p->scriptCode)
664 g_free (p->scriptCode);
665 p->scriptCode = NULL;
666 }
667
668 static gboolean
charset_is_utf8(const gchar * content_type)669 charset_is_utf8 (const gchar *content_type)
670 {
671 return content_type && strstr (content_type, "=utf-8") != NULL;
672 }
673
674 static gboolean
is_text(const gchar * content_type)675 is_text (const gchar *content_type)
676 {
677 return content_type && strstr (content_type, "text/") != NULL;
678 }
679
680 static const gchar *
get_encoding_from_content_type(const gchar * content_type)681 get_encoding_from_content_type (const gchar *content_type)
682 {
683 gchar * charset;
684 if (content_type)
685 {
686 charset = g_strrstr (content_type, "charset=");
687 if (charset != NULL)
688 return charset + strlen ("charset=");
689 charset = g_strrstr (content_type, "encoding=");
690 if (charset != NULL)
691 return charset + strlen ("encoding=");
692
693 }
694 return NULL;
695 }
696
697 GIConv
generate_iconv_from(const gchar * content_type)698 generate_iconv_from (const gchar *content_type)
699 {
700 if (content_type)
701 if (!charset_is_utf8 (content_type))
702 {
703 const gchar * encoding = get_encoding_from_content_type (content_type);
704 if (encoding)
705 return g_iconv_open ("utf-8", encoding);
706 }
707 return NULL;
708 }
709
710 GIConv
generate_iconv_to(const gchar * content_type)711 generate_iconv_to (const gchar *content_type)
712 {
713 if (content_type)
714 if (!charset_is_utf8 (content_type))
715 {
716 const gchar * encoding = get_encoding_from_content_type (content_type);
717 if (encoding)
718 return g_iconv_open (encoding, "utf-8");
719 }
720 return NULL;
721 }
722
723 static void
html_tokenizer_real_engine_type(HTMLTokenizer * t,gboolean engine_type)724 html_tokenizer_real_engine_type (HTMLTokenizer *t,
725 gboolean engine_type)
726 {
727 struct _HTMLTokenizerPrivate *p;
728 p = t->priv;
729
730 p->enableconvert = engine_type;
731 }
732
733 static void
html_tokenizer_real_change(HTMLTokenizer * t,const gchar * content_type)734 html_tokenizer_real_change (HTMLTokenizer *t,
735 const gchar *content_type)
736 {
737 struct _HTMLTokenizerPrivate *p;
738 if (!is_text (content_type))
739 return;
740
741 p = t->priv;
742
743 if (!p->enableconvert)
744 return;
745
746 if (p->content_type)
747 g_free (p->content_type);
748
749 p->content_type = g_ascii_strdown (content_type, -1);
750
751 if (is_valid_g_iconv (p->iconv_cd))
752 g_iconv_close (p->iconv_cd);
753
754 p->iconv_cd = generate_iconv_from (p->content_type);
755
756 #if 0
757 if (charset_is_utf8 (p->content_type))
758 g_warning ("Trying UTF-8");
759 else
760 g_warning ("Trying %s",p->content_type);
761 #endif
762 }
763
764 static void
html_tokenizer_real_begin(HTMLTokenizer * t,const gchar * content_type)765 html_tokenizer_real_begin (HTMLTokenizer *t,
766 const gchar *content_type)
767 {
768 struct _HTMLTokenizerPrivate *p = t->priv;
769
770 html_tokenizer_reset (t);
771
772 p->dest = p->buffer;
773 p->tag = FALSE;
774 p->pending = NonePending;
775 p->discard = NoneDiscard;
776 p->pre = 0;
777 p->script = FALSE;
778 p->style = FALSE;
779 p->skipLF = FALSE;
780 p->select = FALSE;
781 p->comment = FALSE;
782 p->textarea = FALSE;
783 p->startTag = FALSE;
784 p->extension = FALSE;
785 p->tquote = NO_QUOTE;
786 p->searchCount = 0;
787 p->searchGtkHTMLCount = 0;
788 p->title = FALSE;
789
790 html_tokenizer_real_change (t, content_type);
791 }
792
793 static void
destroy_blocking(gpointer data,gpointer user_data)794 destroy_blocking (gpointer data,
795 gpointer user_data)
796 {
797 g_free (data);
798 }
799
800 static void
html_tokenizer_real_end(HTMLTokenizer * t)801 html_tokenizer_real_end (HTMLTokenizer *t)
802 {
803 struct _HTMLTokenizerPrivate *p = t->priv;
804
805 if (p->buffer == 0)
806 return;
807
808 if (p->dest > p->buffer) {
809 html_tokenizer_append_token (t, p->buffer, p->dest - p->buffer);
810 }
811
812 g_free (p->buffer);
813
814 p->buffer = NULL;
815 p->dest = NULL;
816 p->size = 0;
817
818 if (p->blocking) {
819 g_list_foreach (p->blocking, destroy_blocking, NULL);
820 p->tokens_num += p->blocking_tokens_num;
821 p->blocking_tokens_num = 0;
822 }
823 p->blocking = NULL;
824 }
825
826 static void
html_tokenizer_append_token(HTMLTokenizer * t,const gchar * string,gint len)827 html_tokenizer_append_token (HTMLTokenizer *t,
828 const gchar *string,
829 gint len)
830 {
831 struct _HTMLTokenizerPrivate *p = t->priv;
832
833 if (len < 1)
834 return;
835
836 /* allocate first buffer */
837 if (p->write_buf == NULL)
838 html_tokenizer_append_token_buffer (t, len);
839
840 /* try append token to current buffer, if not successful, create append new token buffer */
841 if (!html_token_buffer_append_token (p->write_buf, string, len)) {
842 html_tokenizer_append_token_buffer (t, len + 1);
843 /* now it must pass as we have enough space */
844 g_assert (html_token_buffer_append_token (p->write_buf, string, len));
845 }
846
847 if (p->blocking) {
848 p->blocking_tokens_num++;
849 } else {
850 p->tokens_num++;
851 }
852 }
853
854 static void
add_byte(HTMLTokenizer * t,const gchar ** c)855 add_byte (HTMLTokenizer *t,
856 const gchar **c)
857 {
858 add_char (t,**c);
859 (*c) ++;
860 }
861
862 static void
add_char(HTMLTokenizer * t,gchar c)863 add_char (HTMLTokenizer *t,
864 gchar c)
865 {
866 struct _HTMLTokenizerPrivate *p = t->priv;
867 if (c != '\0')
868 {
869 *(p->dest) = c;
870 p->dest++;
871 *(p->dest) = 0;
872 }
873 }
874
875 static void
html_tokenizer_append_token_buffer(HTMLTokenizer * t,gint min_size)876 html_tokenizer_append_token_buffer (HTMLTokenizer *t,
877 gint min_size)
878 {
879 struct _HTMLTokenizerPrivate *p = t->priv;
880 HTMLTokenBuffer *nb;
881 gint size = TOKEN_BUFFER_SIZE;
882
883 if (min_size > size)
884 size = min_size + (min_size >> 2);
885
886 /* create new buffer and add it to list */
887 nb = html_token_buffer_new (size);
888 p->token_buffers = g_list_append (p->token_buffers, nb);
889
890 /* this one is now write_buf */
891 p->write_buf = nb;
892
893 /* if we don't have read_buf already set it to this one */
894 if (p->read_buf == NULL) {
895 p->read_buf = nb;
896 p->read_cur = p->token_buffers;
897 }
898 }
899
900 /* EP CHECK: OK. */
901 static void
html_tokenizer_add_pending(HTMLTokenizer * t)902 html_tokenizer_add_pending (HTMLTokenizer *t)
903 {
904 struct _HTMLTokenizerPrivate *p = t->priv;
905
906 if (p->tag || p->select) {
907 add_char (t, ' ');
908 }
909 else if (p->textarea) {
910 if (p->pending == LFPending)
911 add_char (t, '\n');
912 else
913 add_char (t, ' ');
914 }
915 else if (p->pre) {
916 switch (p->pending) {
917 case SpacePending:
918 add_char (t, ' ');
919 break;
920 case LFPending:
921 if (p->dest > p->buffer) {
922 html_tokenizer_append_token (t, p->buffer, p->dest - p->buffer);
923 }
924 p->dest = p->buffer;
925 add_char (t, TAG_ESCAPE);
926 add_char (t, '\n');
927 html_tokenizer_append_token (t, p->buffer, 2);
928 p->dest = p->buffer;
929 break;
930 case TabPending:
931 add_char (t, '\t');
932 break;
933 default:
934 g_warning ("Unknown pending type: %d\n", (gint) p->pending);
935 break;
936 }
937 }
938 else {
939 add_char (t, ' ');
940 }
941
942 p->pending = NonePending;
943 }
944
945 static void
prepare_enough_space(HTMLTokenizer * t)946 prepare_enough_space (HTMLTokenizer *t)
947 {
948 struct _HTMLTokenizerPrivate *p = t->priv;
949
950 if ((p->dest - p->buffer + 32) > p->size) {
951 guint off = p->dest - p->buffer;
952
953 p->size += (p->size >> 2) + 32;
954 p->buffer = g_realloc (p->buffer, p->size);
955 p->dest = p->buffer + off;
956 }
957 }
958
959 static void
in_comment(HTMLTokenizer * t,const gchar ** src)960 in_comment (HTMLTokenizer *t,
961 const gchar **src)
962 {
963 struct _HTMLTokenizerPrivate *p = t->priv;
964
965 if (**src == '-') { /* Look for "-->" */
966 if (p->searchCount < 2)
967 p->searchCount++;
968 } else if (p->searchCount == 2 && (**src == '>')) {
969 p->comment = FALSE; /* We've got a "-->" sequence */
970 } else if (tolower (**src) == gtkhtmlStart[p->searchGtkHTMLCount]) {
971 if (p->searchGtkHTMLCount == 8) {
972 p->extension = TRUE;
973 p->comment = FALSE;
974 p->searchCount = 0;
975 p->searchExtensionEndCount = 0;
976 p->searchGtkHTMLCount = 0;
977 } else
978 p->searchGtkHTMLCount++;
979 } else {
980 p->searchGtkHTMLCount = 0;
981 if (p->searchCount < 2)
982 p->searchCount = 0;
983 }
984
985 (*src)++;
986 }
987
988 static inline void
extension_one_char(HTMLTokenizer * t,const gchar ** src)989 extension_one_char (HTMLTokenizer *t,
990 const gchar **src)
991 {
992 struct _HTMLTokenizerPrivate *p = t->priv;
993
994 p->extension = FALSE;
995 html_tokenizer_tokenize_one_char (t, src);
996 p->extension = TRUE;
997 }
998
999 static void
in_extension(HTMLTokenizer * t,const gchar ** src)1000 in_extension (HTMLTokenizer *t,
1001 const gchar **src)
1002 {
1003 struct _HTMLTokenizerPrivate *p = t->priv;
1004
1005 /* check for "-->" */
1006 if (!p->tquote && **src == '-') {
1007 if (p->searchExtensionEndCount < 2)
1008 p->searchExtensionEndCount++;
1009 (*src) ++;
1010 } else if (!p->tquote && p->searchExtensionEndCount == 2 && **src == '>') {
1011 p->extension = FALSE;
1012 (*src) ++;
1013 } else {
1014 if (p->searchExtensionEndCount > 0) {
1015 if (p->extension) {
1016 const gchar *c = "-->";
1017
1018 while (p->searchExtensionEndCount) {
1019 extension_one_char (t, &c);
1020 p->searchExtensionEndCount--;
1021 }
1022 }
1023 }
1024 extension_one_char (t, src);
1025 }
1026 }
1027
1028 static void
in_script_or_style(HTMLTokenizer * t,const gchar ** src)1029 in_script_or_style (HTMLTokenizer *t,
1030 const gchar **src)
1031 {
1032 struct _HTMLTokenizerPrivate *p = t->priv;
1033
1034 /* Allocate memory to store the script or style */
1035 if (p->scriptCodeSize + 11 > p->scriptCodeMaxSize)
1036 p->scriptCode = g_realloc (p->scriptCode, p->scriptCodeMaxSize += 1024);
1037
1038 if ((**src == '>') && (p->searchFor[p->searchCount] == '>')) {
1039 (*src)++;
1040 p->scriptCode[p->scriptCodeSize] = 0;
1041 p->scriptCode[p->scriptCodeSize + 1] = 0;
1042 if (p->script) {
1043 p->script = FALSE;
1044 }
1045 else {
1046 p->style = FALSE;
1047 }
1048 g_free (p->scriptCode);
1049 p->scriptCode = NULL;
1050 }
1051 /* Check if a </script> tag is on its way */
1052 else if (p->searchCount > 0) {
1053 gboolean put_to_script = FALSE;
1054 if (tolower (**src) == p->searchFor[p->searchCount]) {
1055 p->searchBuffer[p->searchCount] = **src;
1056 p->searchCount++;
1057 (*src)++;
1058 }
1059 else if (p->searchFor[p->searchCount] == '>') {
1060 /* There can be any number of white-space characters between
1061 * tag name and closing '>' so try to move through them, if possible */
1062
1063 const gchar **p = src;
1064 while (isspace (**p))
1065 (*p)++;
1066
1067 if (**p == '>')
1068 *src = *p;
1069 else
1070 put_to_script = TRUE;
1071 }
1072 else
1073 put_to_script = TRUE;
1074
1075 if (put_to_script) {
1076 gchar *c;
1077
1078 p->searchBuffer[p->searchCount] = 0;
1079 c = p->searchBuffer;
1080 while (*c)
1081 p->scriptCode[p->scriptCodeSize++] = *c++;
1082 p->scriptCode[p->scriptCodeSize] = **src; (*src)++;
1083 p->searchCount = 0;
1084 }
1085 }
1086 else if (**src == '<') {
1087 p->searchCount = 1;
1088 p->searchBuffer[0] = '<';
1089 (*src)++;
1090 }
1091 else {
1092 p->scriptCode[p->scriptCodeSize] = **src;
1093 (*src)++;
1094 }
1095 }
1096
1097 static void
in_tag(HTMLTokenizer * t,const gchar ** src)1098 in_tag (HTMLTokenizer *t,
1099 const gchar **src)
1100 {
1101 struct _HTMLTokenizerPrivate *p = t->priv;
1102
1103 p->startTag = FALSE;
1104 if (**src == '/') {
1105 if (p->pending == LFPending && !p->pre) {
1106 p->pending = NonePending;
1107 }
1108 }
1109 else if (((**src >= 'a') && (**src <= 'z'))
1110 || ((**src >= 'A') && (**src <= 'Z'))) {
1111 /* Start of a start tag */
1112 }
1113 else if (**src == '!') {
1114 /* <!-- comment --> */
1115 }
1116 else if (**src == '?') {
1117 /* <? meta ?> */
1118 }
1119 else {
1120 /* Invalid tag, just add it */
1121 if (p->pending)
1122 html_tokenizer_add_pending (t);
1123 add_char (t, '<');
1124 add_byte (t, src);
1125 return;
1126 }
1127
1128 if (p->pending)
1129 html_tokenizer_add_pending (t);
1130
1131 if (p->dest > p->buffer) {
1132 html_tokenizer_append_token (t, p->buffer, p->dest - p->buffer);
1133 p->dest = p->buffer;
1134 }
1135 add_char (t, TAG_ESCAPE);
1136 add_char (t, '<');
1137 p->tag = TRUE;
1138 p->searchCount = 1; /* Look for <!-- to start comment */
1139 }
1140
1141 static void
start_tag(HTMLTokenizer * t,const gchar ** src)1142 start_tag (HTMLTokenizer *t,
1143 const gchar **src)
1144 {
1145 (*src)++;
1146 t->priv->startTag = TRUE;
1147 t->priv->discard = NoneDiscard;
1148 }
1149
1150 static void
end_tag(HTMLTokenizer * t,const gchar ** src)1151 end_tag (HTMLTokenizer *t,
1152 const gchar **src)
1153 {
1154 struct _HTMLTokenizerPrivate *p = t->priv;
1155 gchar *ptr;
1156
1157 p->searchCount = 0; /* Stop looking for <!-- sequence */
1158
1159 add_char (t, '>');
1160
1161 /* Make the tag lower case */
1162 ptr = p->buffer + 2;
1163 if (p->pre || *ptr == '/') {
1164 /* End tag */
1165 p->discard = NoneDiscard;
1166 }
1167 else {
1168 /* Start tag */
1169 /* Ignore CRLFs after a start tag */
1170 p->discard = LFDiscard;
1171 }
1172
1173 while (*ptr && *ptr !=' ') {
1174 *ptr = tolower (*ptr);
1175 ptr++;
1176 }
1177 html_tokenizer_append_token (t, p->buffer, p->dest - p->buffer);
1178 p->dest = p->buffer;
1179
1180 p->tag = FALSE;
1181 p->pending = NonePending;
1182 (*src)++;
1183
1184 if (strncmp (p->buffer + 2, "pre", 3) == 0) {
1185 p->pre++;
1186 }
1187 else if (strncmp (p->buffer + 2, "/pre", 4) == 0) {
1188 p->pre--;
1189 }
1190 else if (strncmp (p->buffer + 2, "textarea", 8) == 0) {
1191 p->textarea = TRUE;
1192 }
1193 else if (strncmp (p->buffer + 2, "/textarea", 9) == 0) {
1194 p->textarea = FALSE;
1195 }
1196 else if (strncmp (p->buffer + 2, "title", 5) == 0) {
1197 p->title = TRUE;
1198 }
1199 else if (strncmp (p->buffer + 2, "/title", 6) == 0) {
1200 p->title = FALSE;
1201 }
1202 else if (strncmp (p->buffer + 2, "script", 6) == 0) {
1203 p->script = TRUE;
1204 p->searchCount = 0;
1205 p->searchFor = scriptEnd;
1206 p->scriptCode = g_malloc (1024);
1207 p->scriptCodeSize = 0;
1208 p->scriptCodeMaxSize = 1024;
1209 }
1210 else if (strncmp (p->buffer + 2, "style", 5) == 0) {
1211 p->style = TRUE;
1212 p->searchCount = 0;
1213 p->searchFor = styleEnd;
1214 p->scriptCode = g_malloc (1024);
1215 p->scriptCodeSize = 0;
1216 p->scriptCodeMaxSize = 1024;
1217 }
1218 else if (strncmp (p->buffer + 2, "select", 6) == 0) {
1219 p->select = TRUE;
1220 }
1221 else if (strncmp (p->buffer + 2, "/select", 7) == 0) {
1222 p->select = FALSE;
1223 }
1224 else if (strncmp (p->buffer + 2, "tablesdkl", 9) == 0) {
1225 html_tokenizer_blocking_push (t, Table);
1226 }
1227 else {
1228 if (p->blocking) {
1229 const gchar *bn = html_tokenizer_blocking_get_name (t);
1230
1231 if (strncmp (p->buffer + 1, bn, strlen (bn)) == 0) {
1232 html_tokenizer_blocking_pop (t);
1233 }
1234 }
1235 }
1236 }
1237
1238 static void
in_crlf(HTMLTokenizer * t,const gchar ** src)1239 in_crlf (HTMLTokenizer *t,
1240 const gchar **src)
1241 {
1242 struct _HTMLTokenizerPrivate *p = t->priv;
1243
1244 if (p->tquote) {
1245 if (p->discard == NoneDiscard)
1246 p->pending = SpacePending;
1247 }
1248 else if (p->tag) {
1249 p->searchCount = 0; /* Stop looking for <!-- sequence */
1250 if (p->discard == NoneDiscard)
1251 p->pending = SpacePending; /* Treat LFs inside tags as spaces */
1252 }
1253 else if (p->pre || p->textarea) {
1254 if (p->discard == LFDiscard) {
1255 /* Ignore this LF */
1256 p->discard = NoneDiscard; /* We have discarded 1 LF */
1257 } else {
1258 /* Process this LF */
1259 if (p->pending)
1260 html_tokenizer_add_pending (t);
1261 p->pending = LFPending;
1262 }
1263 }
1264 else {
1265 if (p->discard == LFDiscard) {
1266 /* Ignore this LF */
1267 p->discard = NoneDiscard; /* We have discarded 1 LF */
1268 } else {
1269 /* Process this LF */
1270 if (p->pending == NonePending)
1271 p->pending = LFPending;
1272 }
1273 }
1274 /* Check for MS-DOS CRLF sequence */
1275 if (**src == '\r') {
1276 p->skipLF = TRUE;
1277 }
1278 (*src)++;
1279 }
1280
1281 static void
in_space_or_tab(HTMLTokenizer * t,const gchar ** src)1282 in_space_or_tab (HTMLTokenizer *t,
1283 const gchar **src)
1284 {
1285 if (t->priv->tquote) {
1286 if (t->priv->discard == NoneDiscard)
1287 t->priv->pending = SpacePending;
1288 }
1289 else if (t->priv->tag) {
1290 t->priv->searchCount = 0; /* Stop looking for <!-- sequence */
1291 if (t->priv->discard == NoneDiscard)
1292 t->priv->pending = SpacePending;
1293 }
1294 else if (t->priv->pre || t->priv->textarea) {
1295 if (t->priv->pending)
1296 html_tokenizer_add_pending (t);
1297 if (**src == ' ')
1298 t->priv->pending = SpacePending;
1299 else
1300 t->priv->pending = TabPending;
1301 }
1302 else {
1303 t->priv->pending = SpacePending;
1304 }
1305 (*src)++;
1306 }
1307
1308 static void
in_quoted(HTMLTokenizer * t,const gchar ** src)1309 in_quoted (HTMLTokenizer *t,
1310 const gchar **src)
1311 {
1312 /* We treat ' and " the same in tags " */
1313 t->priv->discard = NoneDiscard;
1314 if (t->priv->tag) {
1315 t->priv->searchCount = 0; /* Stop looking for <!-- sequence */
1316 if ((t->priv->tquote == SINGLE_QUOTE && **src == '\"') /* match " */
1317 || (t->priv->tquote == DOUBLE_QUOTE && **src == '\'')) {
1318 add_char (t, **src);
1319 (*src)++;
1320 } else if (*(t->priv->dest - 1) == '=' && !t->priv->tquote) {
1321 t->priv->discard = SpaceDiscard;
1322 t->priv->pending = NonePending;
1323
1324 if (**src == '\"') /* match " */
1325 t->priv->tquote = DOUBLE_QUOTE;
1326 else
1327 t->priv->tquote = SINGLE_QUOTE;
1328 add_char (t, **src);
1329 (*src)++;
1330 }
1331 else if (t->priv->tquote) {
1332 t->priv->tquote = NO_QUOTE;
1333 add_byte (t, src);
1334 t->priv->pending = SpacePending;
1335 }
1336 else {
1337 /* Ignore stray "\'" */
1338 (*src)++;
1339 }
1340 }
1341 else {
1342 if (t->priv->pending)
1343 html_tokenizer_add_pending (t);
1344
1345 add_byte (t, src);
1346 }
1347 }
1348
1349 static void
in_assignment(HTMLTokenizer * t,const gchar ** src)1350 in_assignment (HTMLTokenizer *t,
1351 const gchar **src)
1352 {
1353 t->priv->discard = NoneDiscard;
1354 if (t->priv->tag) {
1355 t->priv->searchCount = 0; /* Stop looking for <!-- sequence */
1356 add_char (t, '=');
1357 if (!t->priv->tquote) {
1358 t->priv->pending = NonePending;
1359 t->priv->discard = SpaceDiscard;
1360 }
1361 }
1362 else {
1363 if (t->priv->pending)
1364 html_tokenizer_add_pending (t);
1365
1366 add_char (t, '=');
1367 }
1368 (*src)++;
1369 }
1370
1371 inline static void
in_plain(HTMLTokenizer * t,const gchar ** src)1372 in_plain (HTMLTokenizer *t,
1373 const gchar **src)
1374 {
1375 struct _HTMLTokenizerPrivate *p = t->priv;
1376
1377 p->discard = NoneDiscard;
1378 if (p->pending)
1379 html_tokenizer_add_pending (t);
1380
1381 if (p->tag) {
1382 if (p->searchCount > 0) {
1383 if (**src == commentStart[p->searchCount]) {
1384 p->searchCount++;
1385 if (p->searchCount == 4) {
1386 /* Found <!-- sequence */
1387 p->comment = TRUE;
1388 p->dest = p->buffer;
1389 p->tag = FALSE;
1390 p->searchCount = 0;
1391 return;
1392 }
1393 }
1394 else {
1395 p->searchCount = 0; /* Stop lookinf for <!-- sequence */
1396 }
1397 }
1398 }
1399
1400 add_byte (t, src);
1401 }
1402
1403 static void
html_tokenizer_tokenize_one_char(HTMLTokenizer * t,const gchar ** src)1404 html_tokenizer_tokenize_one_char (HTMLTokenizer *t,
1405 const gchar **src)
1406 {
1407 struct _HTMLTokenizerPrivate *p = t->priv;
1408
1409 prepare_enough_space (t);
1410
1411 if (p->skipLF && **src != '\n')
1412 p->skipLF = FALSE;
1413
1414 if (p->skipLF)
1415 (*src) ++;
1416 else if (p->comment)
1417 in_comment (t, src);
1418 else if (p->extension)
1419 in_extension (t, src);
1420 else if (p->script || p->style)
1421 in_script_or_style (t, src);
1422 else if (p->startTag)
1423 in_tag (t, src);
1424 else if (**src == '<' && !p->tag)
1425 start_tag (t, src);
1426 else if (**src == '>' && p->tag && !p->tquote)
1427 end_tag (t, src);
1428 else if ((**src == '\n') || (**src == '\r'))
1429 in_crlf (t, src);
1430 else if ((**src == ' ') || (**src == '\t'))
1431 in_space_or_tab (t, src);
1432 else if (**src == '\"' || **src == '\'') /* match " ' */
1433 in_quoted (t, src);
1434 else if (**src == '=')
1435 in_assignment (t, src);
1436 else
1437 in_plain (t, src);
1438 }
1439
1440 static void
html_tokenizer_real_write(HTMLTokenizer * t,const gchar * string,gsize size)1441 html_tokenizer_real_write (HTMLTokenizer *t,
1442 const gchar *string,
1443 gsize size)
1444 {
1445 const gchar *src = string;
1446
1447 while ((src - string) < size)
1448 html_tokenizer_tokenize_one_char (t, &src);
1449 }
1450
1451 static const gchar *
html_tokenizer_blocking_get_name(HTMLTokenizer * t)1452 html_tokenizer_blocking_get_name (HTMLTokenizer *t)
1453 {
1454 switch (GPOINTER_TO_INT (t->priv->blocking->data)) {
1455 case Table:
1456 return "</tabledkdk";
1457 }
1458
1459 return "";
1460 }
1461
1462 static void
html_tokenizer_blocking_push(HTMLTokenizer * t,HTMLTokenType tt)1463 html_tokenizer_blocking_push (HTMLTokenizer *t,
1464 HTMLTokenType tt)
1465 {
1466 struct _HTMLTokenizerPrivate *p = t->priv;
1467
1468 /* block tokenizer - we must block last token in buffers as it was already added */
1469 if (!p->blocking) {
1470 p->tokens_num--;
1471 p->blocking_tokens_num++;
1472 }
1473 p->blocking = g_list_prepend (p->blocking, GINT_TO_POINTER (tt));
1474 }
1475
1476 static void
html_tokenizer_blocking_pop(HTMLTokenizer * t)1477 html_tokenizer_blocking_pop (HTMLTokenizer *t)
1478 {
1479 struct _HTMLTokenizerPrivate *p = t->priv;
1480
1481 p->blocking = g_list_remove (p->blocking, p->blocking->data);
1482
1483 /* unblock tokenizer */
1484 if (!p->blocking) {
1485 p->tokens_num += p->blocking_tokens_num;
1486 p->blocking_tokens_num = 0;
1487 }
1488 }
1489
1490 /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/
1491
1492 void
html_tokenizer_begin(HTMLTokenizer * t,const gchar * content_type)1493 html_tokenizer_begin (HTMLTokenizer *t,
1494 const gchar *content_type)
1495 {
1496
1497 g_return_if_fail (t && HTML_IS_TOKENIZER (t));
1498
1499 g_signal_emit (t, html_tokenizer_signals[HTML_TOKENIZER_BEGIN_SIGNAL], 0, content_type);
1500 }
1501
1502 void
html_tokenizer_set_engine_type(HTMLTokenizer * t,gboolean engine_type)1503 html_tokenizer_set_engine_type (HTMLTokenizer *t,
1504 gboolean engine_type)
1505 {
1506 g_return_if_fail (t && HTML_IS_TOKENIZER (t));
1507
1508 g_signal_emit (t, html_tokenizer_signals[HTML_TOKENIZER_CHANGEENGINE_SIGNAL], 0, engine_type);
1509 }
1510
1511 void
html_tokenizer_change_content_type(HTMLTokenizer * t,const gchar * content_type)1512 html_tokenizer_change_content_type (HTMLTokenizer *t,const gchar *content_type)
1513 {
1514 g_return_if_fail (t && HTML_IS_TOKENIZER (t));
1515
1516 g_signal_emit (t, html_tokenizer_signals[HTML_TOKENIZER_CHANGECONTENT_SIGNAL], 0, content_type);
1517 }
1518
1519 void
html_tokenizer_end(HTMLTokenizer * t)1520 html_tokenizer_end (HTMLTokenizer *t)
1521 {
1522 g_return_if_fail (t && HTML_IS_TOKENIZER (t));
1523
1524 g_signal_emit (t, html_tokenizer_signals[HTML_TOKENIZER_END_SIGNAL], 0);
1525 }
1526
1527 void
html_tokenizer_write(HTMLTokenizer * t,const gchar * str,gsize size)1528 html_tokenizer_write (HTMLTokenizer *t,
1529 const gchar *str,
1530 gsize size)
1531 {
1532 HTMLTokenizerClass *klass;
1533
1534 g_return_if_fail (t && HTML_IS_TOKENIZER (t));
1535 klass = HTML_TOKENIZER_CLASS (G_OBJECT_GET_CLASS (t));
1536
1537 if (klass->write)
1538 klass->write (t, str, size);
1539 else
1540 g_warning ("No write method defined.");
1541 }
1542
1543 gchar *
html_tokenizer_peek_token(HTMLTokenizer * t)1544 html_tokenizer_peek_token (HTMLTokenizer *t)
1545 {
1546 HTMLTokenizerClass *klass;
1547
1548 g_return_val_if_fail (t && HTML_IS_TOKENIZER (t), NULL);
1549
1550 klass = HTML_TOKENIZER_CLASS (G_OBJECT_GET_CLASS (t));
1551
1552 if (klass->peek_token)
1553 return klass->peek_token (t);
1554
1555 g_warning ("No peek_token method defined.");
1556 return NULL;
1557
1558 }
1559
1560 const gchar *
html_tokenizer_get_content_type(HTMLTokenizer * t)1561 html_tokenizer_get_content_type (HTMLTokenizer *t)
1562 {
1563 HTMLTokenizerClass *klass;
1564
1565 g_return_val_if_fail (t && HTML_IS_TOKENIZER (t), NULL);
1566
1567 klass = HTML_TOKENIZER_CLASS (G_OBJECT_GET_CLASS (t));
1568
1569 if (klass->get_content_type)
1570 return klass->get_content_type (t);
1571
1572 g_warning ("No get_content_type method defined.");
1573 return NULL;
1574
1575 }
1576
1577 gboolean
html_tokenizer_get_engine_type(HTMLTokenizer * t)1578 html_tokenizer_get_engine_type (HTMLTokenizer *t)
1579 {
1580 HTMLTokenizerClass *klass;
1581
1582 g_return_val_if_fail (t && HTML_IS_TOKENIZER (t),FALSE);
1583
1584 klass = HTML_TOKENIZER_CLASS (G_OBJECT_GET_CLASS (t));
1585
1586 if (klass->get_engine_type)
1587 return klass->get_engine_type (t);
1588
1589 g_warning ("No get_engine_type method defined.");
1590 return FALSE;
1591 }
1592
1593 gchar *
html_tokenizer_next_token(HTMLTokenizer * t)1594 html_tokenizer_next_token (HTMLTokenizer *t)
1595 {
1596 HTMLTokenizerClass *klass;
1597
1598 g_return_val_if_fail (t && HTML_IS_TOKENIZER (t), NULL);
1599
1600 klass = HTML_TOKENIZER_CLASS (G_OBJECT_GET_CLASS (t));
1601
1602 if (klass->next_token)
1603 return klass->next_token (t);
1604
1605 g_warning ("No next_token method defined.");
1606 return NULL;
1607 }
1608
1609 gboolean
html_tokenizer_has_more_tokens(HTMLTokenizer * t)1610 html_tokenizer_has_more_tokens (HTMLTokenizer *t)
1611 {
1612 HTMLTokenizerClass *klass;
1613
1614 g_return_val_if_fail (t && HTML_IS_TOKENIZER (t), FALSE);
1615
1616 klass = HTML_TOKENIZER_CLASS (G_OBJECT_GET_CLASS (t));
1617
1618 if (klass->has_more) {
1619 return klass->has_more (t);
1620 }
1621
1622 g_warning ("No has_more method defined.");
1623 return FALSE;
1624
1625 }
1626
1627 HTMLTokenizer *
html_tokenizer_clone(HTMLTokenizer * t)1628 html_tokenizer_clone (HTMLTokenizer *t)
1629 {
1630 HTMLTokenizerClass *klass;
1631
1632 if (t == NULL)
1633 return NULL;
1634 g_return_val_if_fail (HTML_IS_TOKENIZER (t), NULL);
1635
1636 klass = HTML_TOKENIZER_CLASS (G_OBJECT_GET_CLASS (t));
1637
1638 if (klass->clone)
1639 return klass->clone (t);
1640
1641 g_warning ("No clone method defined.");
1642 return NULL;
1643 }
1644