1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3  *  Copyright (C) 1997 Martin Jones (mjones@kde.org)
4  *            (C) 1997 Torben Weis (weis@kde.org)
5  *            (C) 1999 Anders Carlsson (andersca@gnu.org)
6  *            (C) 2000 Helix Code, Inc., Radek Doulik (rodo@helixcode.com)
7  *            (C) 2001 Ximian, Inc.
8  *
9  *  This library is free software; you can redistribute it and/or
10  *  modify it under the terms of the GNU Library General Public
11  *  License as published by the Free Software Foundation; either
12  *  version 2 of the License, or (at your option) any later version.
13  *
14  *  This library is distributed in the hope that it will be useful,
15  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  *  Library General Public License for more details.
18  *
19  *  You should have received a copy of the GNU Library General Public License
20  *  along with this library; see the file COPYING.LIB.  If not, write to
21  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22  *  Boston, MA 02110-1301, USA.
23 */
24 
25 /* The HTML Tokenizer */
26 #include <config.h>
27 #include <ctype.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include "htmltokenizer.h"
31 #include "htmlentity.h"
32 
/* Indices into html_tokenizer_signals[]; one per signal registered in
 * html_tokenizer_class_init (). */
enum {
	HTML_TOKENIZER_BEGIN_SIGNAL,
	HTML_TOKENIZER_END_SIGNAL,
	HTML_TOKENIZER_CHANGECONTENT_SIGNAL,
	HTML_TOKENIZER_CHANGEENGINE_SIGNAL,
	HTML_TOKENIZER_LAST_SIGNAL	/* count of the entries above; sizes the id table */
};

/* Signal ids returned by g_signal_new (), indexed by the enum above. */
static guint html_tokenizer_signals[HTML_TOKENIZER_LAST_SIGNAL] = { 0 };
42 
/* Default capacity, in bytes, of one token buffer; see
 * html_tokenizer_append_token_buffer () for when a larger one is used. */
#define TOKEN_BUFFER_SIZE (1 << 10)

/* Debug-trace hook: expands to nothing, so dt (...) statements vanish. */
#define dt(x)

typedef struct _HTMLBlockingToken HTMLBlockingToken;
typedef struct _HTMLTokenBuffer   HTMLTokenBuffer;
/* Kind of blocking token accepted by html_tokenizer_blocking_push (). */
typedef	enum { Table }            HTMLTokenType;
50 
/* One chunk of token storage: tokens are stored back to back in data,
 * each followed by a terminating zero byte. */
struct _HTMLTokenBuffer {
	gint size;	/* capacity of data, in bytes */
	gint used;	/* bytes of data already occupied (terminators included) */
	gchar * data;	/* storage, allocated in html_token_buffer_new () */
};
56 
/* Per-instance tokenizer state; allocated zero-filled in
 * html_tokenizer_init () and released in html_tokenizer_finalize (). */
struct _HTMLTokenizerPrivate {

	/* token buffers list */
	GList *token_buffers;

	/* current read_buf position in list */
	GList *read_cur;

	/* current read buffer */
	HTMLTokenBuffer * read_buf;
	/* buffer new tokens are appended to (the most recently created one) */
	HTMLTokenBuffer * write_buf;

	/* position in the read_buf */
	gint read_pos;

	/* unread non-blocking and blocking tokens in the tokenizer; a token is
	 * counted as blocking when it is appended while `blocking` is non-empty */
	gint tokens_num;
	gint blocking_tokens_num;

	/* token being assembled: `buffer` is its start, `dest` the write
	 * cursor inside it and `size` the allocated capacity of `buffer` */
	gchar *dest;
	gchar *buffer;
	gint size;

	gboolean skipLF; /* Skip the LF part of a CRLF sequence */

	gboolean tag; /* Are we in an html tag? */
	gboolean tquote; /* Are we in quotes in an html tag? */
	gboolean startTag;
	gboolean comment; /* Are we in a comment block? */
	gboolean title; /* Are we in a <title> block? */
	gboolean style; /* Are we in a <style> block? */
	gboolean script; /* Are we in a <script> block? */
	gboolean textarea; /* Are we in a <textarea> block? */
	gint     pre; /* Are we in a <pre> block? (an integer, presumably a nesting depth - TODO confirm) */
	gboolean select; /* Are we in a <select> block? */
	gboolean extension; /* Are we in an <!-- +GtkHTML: sequence? */

	enum {
		NoneDiscard = 0,
		SpaceDiscard,
		LFDiscard
	} discard;

	/* white space seen but not yet emitted; flushed by
	 * html_tokenizer_add_pending () */
	enum {
		NonePending = 0,
		SpacePending,
		LFPending,
		TabPending
	} pending;

	/* characters of a possible end tag matched so far inside <script>/<style> */
	gchar searchBuffer[20];
	/* shared match counter: dashes seen inside a comment, or matched prefix
	 * length of `searchFor` inside script/style */
	gint searchCount;
	/* matched prefix length of "+gtkhtml:" inside a comment */
	gint searchGtkHTMLCount;
	/* dashes of a closing "-->" seen inside an extension */
	gint searchExtensionEndCount;

	/* scratch storage for script/style text, its fill level and capacity */
	gchar *scriptCode;
	gint scriptCodeSize;
	gint scriptCodeMaxSize;

	GList *blocking; /* Blocking tokens */

	/* end-tag string currently being searched for in script/style mode */
	const gchar *searchFor;

	/* when FALSE, html_tokenizer_real_change () leaves the charset setup untouched */
	gboolean enableconvert;

	/* lower-cased content type string owned by this struct */
	gchar * content_type;
	/* converter towards UTF-8 derived from content_type (may be NULL or invalid) */
	GIConv iconv_cd;

};
127 
/* Literal sequences the state machine searches for. gtkhtmlStart is matched
 * case-insensitively in in_comment (); the others are presumably used by
 * code further down the file - TODO confirm. */
static const gchar *commentStart = "<!--";
static const gchar *scriptEnd = "</script>";
static const gchar *styleEnd = "</style>";
static const gchar *gtkhtmlStart = "+gtkhtml:";

/* Quoting state; NO_QUOTE is what html_tokenizer_real_begin () stores in
 * the private `tquote` field. */
enum quoteEnum {
	NO_QUOTE = 0,
	SINGLE_QUOTE,
	DOUBLE_QUOTE
};
138 
/* private tokenizer functions */
static void           html_tokenizer_reset        (HTMLTokenizer *t);
static void           html_tokenizer_add_pending  (HTMLTokenizer *t);
static void           html_tokenizer_append_token (HTMLTokenizer *t,
						   const gchar *string,
						   gint len);
static void           html_tokenizer_append_token_buffer (HTMLTokenizer *t,
							  gint min_size);

/* default implementations of tokenization functions */
static void     html_tokenizer_finalize             (GObject *);
static void     html_tokenizer_real_change          (HTMLTokenizer *, const gchar *content_type);
static void     html_tokenizer_real_begin           (HTMLTokenizer *, const gchar *content_type);
static void     html_tokenizer_real_engine_type (HTMLTokenizer *t, gboolean engine_type);
static void     html_tokenizer_real_write           (HTMLTokenizer *, const gchar *str, gsize size);
static void     html_tokenizer_real_end             (HTMLTokenizer *);
static const gchar *
				html_tokenizer_real_get_content_type (HTMLTokenizer *);
static gboolean
				html_tokenizer_real_get_engine_type (HTMLTokenizer *);
static gchar   *html_tokenizer_real_peek_token      (HTMLTokenizer *);
static gchar   *html_tokenizer_real_next_token      (HTMLTokenizer *);
static gboolean html_tokenizer_real_has_more_tokens (HTMLTokenizer *);
static gchar   *html_tokenizer_converted_token (HTMLTokenizer *t,const gchar * token);

static HTMLTokenizer *html_tokenizer_real_clone     (HTMLTokenizer *);

/* blocking tokens */
static const gchar       *html_tokenizer_blocking_get_name   (HTMLTokenizer  *t);
static void               html_tokenizer_blocking_pop        (HTMLTokenizer  *t);
static void               html_tokenizer_blocking_push       (HTMLTokenizer  *t,
							      HTMLTokenType   tt);
static void               html_tokenizer_tokenize_one_char   (HTMLTokenizer  *t,
							      const gchar  **src);
static void				  add_char (HTMLTokenizer *t, gchar c);

/* charset helpers; these two have external linkage (not static) */
gboolean				  is_need_convert (const gchar * token);

gchar *					  html_tokenizer_convert_entity (gchar * token);

/* GObject class reference taken in html_tokenizer_class_init (); used to
 * chain up from html_tokenizer_finalize (). */
static GObjectClass *parent_class = NULL;
180 
181 static void
html_tokenizer_class_init(HTMLTokenizerClass * klass)182 html_tokenizer_class_init (HTMLTokenizerClass *klass)
183 {
184 	GObjectClass *object_class = (GObjectClass *) klass;
185 
186 	parent_class = g_type_class_ref (G_TYPE_OBJECT);
187 
188 	html_tokenizer_signals[HTML_TOKENIZER_CHANGECONTENT_SIGNAL] =
189 		g_signal_new ("change",
190 			      G_TYPE_FROM_CLASS (klass),
191 			      G_SIGNAL_RUN_LAST,
192 			      G_STRUCT_OFFSET (HTMLTokenizerClass, change),
193 			      NULL, NULL,
194 			      g_cclosure_marshal_VOID__POINTER,
195 			      G_TYPE_NONE,
196 			      1, G_TYPE_POINTER);
197 
198 	html_tokenizer_signals[HTML_TOKENIZER_CHANGEENGINE_SIGNAL] =
199 		g_signal_new ("engine",
200 			      G_TYPE_FROM_CLASS (klass),
201 			      G_SIGNAL_RUN_LAST,
202 			      G_STRUCT_OFFSET (HTMLTokenizerClass, engine),
203 			      NULL, NULL,
204 			      g_cclosure_marshal_VOID__POINTER,
205 			      G_TYPE_NONE,
206 			      1, G_TYPE_POINTER);
207 
208 	html_tokenizer_signals[HTML_TOKENIZER_BEGIN_SIGNAL] =
209 		g_signal_new ("begin",
210 			      G_TYPE_FROM_CLASS (klass),
211 			      G_SIGNAL_RUN_LAST,
212 			      G_STRUCT_OFFSET (HTMLTokenizerClass, begin),
213 			      NULL, NULL,
214 			      g_cclosure_marshal_VOID__POINTER,
215 			      G_TYPE_NONE,
216 			      1, G_TYPE_POINTER);
217 
218 	html_tokenizer_signals[HTML_TOKENIZER_END_SIGNAL] =
219 		g_signal_new ("end",
220 			      G_TYPE_FROM_CLASS (klass),
221 			      G_SIGNAL_RUN_LAST,
222 			      G_STRUCT_OFFSET (HTMLTokenizerClass, end),
223 			      NULL, NULL,
224 			      g_cclosure_marshal_VOID__VOID,
225 			      G_TYPE_NONE,
226 			      0);
227 
228 	object_class->finalize = html_tokenizer_finalize;
229 
230 	klass->change     = html_tokenizer_real_change;
231 	klass->engine     = html_tokenizer_real_engine_type;
232 	klass->begin      = html_tokenizer_real_begin;
233 	klass->end        = html_tokenizer_real_end;
234 
235 	klass->write      = html_tokenizer_real_write;
236 	klass->peek_token = html_tokenizer_real_peek_token;
237 	klass->next_token = html_tokenizer_real_next_token;
238 	klass->get_content_type = html_tokenizer_real_get_content_type;
239 	klass->get_engine_type = html_tokenizer_real_get_engine_type;
240 	klass->has_more   = html_tokenizer_real_has_more_tokens;
241 	klass->clone      = html_tokenizer_real_clone;
242 }
243 
244 static void
html_tokenizer_init(HTMLTokenizer * t)245 html_tokenizer_init (HTMLTokenizer *t)
246 {
247 	struct _HTMLTokenizerPrivate *p;
248 
249 	t->priv = p = g_new0 (struct _HTMLTokenizerPrivate, 1);
250 
251 	p->token_buffers = NULL;
252 	p->read_cur  = NULL;
253 	p->read_buf  = NULL;
254 	p->write_buf = NULL;
255 	p->read_pos  = 0;
256 
257 	p->dest = NULL;
258 	p->buffer = NULL;
259 	p->size = 0;
260 
261 	p->skipLF = FALSE;
262 	p->tag = FALSE;
263 	p->tquote = FALSE;
264 	p->startTag = FALSE;
265 	p->comment = FALSE;
266 	p->title = FALSE;
267 	p->style = FALSE;
268 	p->script = FALSE;
269 	p->textarea = FALSE;
270 	p->pre = 0;
271 	p->select = FALSE;
272 	p->extension = FALSE;
273 
274 	p->discard = NoneDiscard;
275 	p->pending = NonePending;
276 
277 	memset (p->searchBuffer, 0, sizeof (p->searchBuffer));
278 	p->searchCount = 0;
279 	p->searchGtkHTMLCount = 0;
280 
281 	p->scriptCode = NULL;
282 	p->scriptCodeSize = 0;
283 	p->scriptCodeMaxSize = 0;
284 
285 	p->blocking = NULL;
286 
287 	p->searchFor = NULL;
288 
289 	/* Use old logic and not convert charset */
290 	p->enableconvert = FALSE;
291 
292 	p->content_type = g_strdup ("html/text; charset=utf-8");
293 }
294 
295 static void
html_tokenizer_finalize(GObject * obj)296 html_tokenizer_finalize (GObject *obj)
297 {
298 	HTMLTokenizer *t = HTML_TOKENIZER (obj);
299 
300 	html_tokenizer_reset (t);
301 
302 	if (is_valid_g_iconv (t->priv->iconv_cd))
303 		g_iconv_close (t->priv->iconv_cd);
304 
305 	if (t->priv->content_type)
306 		g_free (t->priv->content_type);
307 
308 	g_free (t->priv);
309 	t->priv = NULL;
310 
311     G_OBJECT_CLASS (parent_class)->finalize (obj);
312 }
313 
314 GType
html_tokenizer_get_type(void)315 html_tokenizer_get_type (void)
316 {
317 	static GType html_tokenizer_type = 0;
318 
319 	if (!html_tokenizer_type) {
320 		static const GTypeInfo html_tokenizer_info = {
321 			sizeof (HTMLTokenizerClass),
322 			NULL,
323 			NULL,
324 			(GClassInitFunc) html_tokenizer_class_init,
325 			NULL,
326 			NULL,
327 			sizeof (HTMLTokenizer),
328 			1,
329 			(GInstanceInitFunc) html_tokenizer_init,
330 		};
331 		html_tokenizer_type = g_type_register_static (G_TYPE_OBJECT, "HTMLTokenizer", &html_tokenizer_info, 0);
332 	}
333 
334 	return html_tokenizer_type;
335 }
336 
337 static HTMLTokenBuffer *
html_token_buffer_new(gint size)338 html_token_buffer_new (gint size)
339 {
340 	HTMLTokenBuffer *nb = g_new (HTMLTokenBuffer, 1);
341 
342 	nb->data = g_new (gchar, size);
343 	nb->size = size;
344 	nb->used = 0;
345 
346 	return nb;
347 }
348 
349 static void
html_token_buffer_destroy(HTMLTokenBuffer * tb)350 html_token_buffer_destroy (HTMLTokenBuffer *tb)
351 {
352 	g_free (tb->data);
353 	g_free (tb);
354 }
355 
356 static gboolean
html_token_buffer_append_token(HTMLTokenBuffer * buf,const gchar * token,gint len)357 html_token_buffer_append_token (HTMLTokenBuffer *buf,
358                                 const gchar *token,
359                                 gint len)
360 {
361 
362 	/* check if we have enough free space */
363 	if (len + 1 > buf->size - buf->used) {
364 		return FALSE;
365 	}
366 
367 	/* copy token and terminate with zero */
368 	strncpy (buf->data + buf->used, token, len);
369 	buf->used += len;
370 	buf->data[buf->used] = 0;
371 	buf->used++;
372 
373 	dt (printf ("html_token_buffer_append_token: '%s'\n", buf->data + buf->used - 1 - len));
374 
375 	return TRUE;
376 }
377 
378 HTMLTokenizer *
html_tokenizer_new(void)379 html_tokenizer_new (void)
380 {
381 	return (HTMLTokenizer *) g_object_new (HTML_TYPE_TOKENIZER, NULL);
382 }
383 
384 void
html_tokenizer_destroy(HTMLTokenizer * t)385 html_tokenizer_destroy (HTMLTokenizer *t)
386 {
387 	g_return_if_fail (t && HTML_IS_TOKENIZER (t));
388 
389 	g_object_unref (G_OBJECT (t));
390 }
391 
392 static gchar *
html_tokenizer_real_peek_token(HTMLTokenizer * t)393 html_tokenizer_real_peek_token (HTMLTokenizer *t)
394 {
395 	struct _HTMLTokenizerPrivate *p = t->priv;
396 	gchar *token;
397 
398 	g_assert (p->read_buf);
399 
400 	if (p->read_buf->used > p->read_pos) {
401 		token = p->read_buf->data + p->read_pos;
402 	} else {
403 		GList *next;
404 		HTMLTokenBuffer *buffer;
405 
406 		g_assert (p->read_cur);
407 		g_assert (p->read_buf);
408 
409 		/* lookup for next buffer */
410 		next = p->read_cur->next;
411 		g_assert (next);
412 
413 		buffer = (HTMLTokenBuffer *) next->data;
414 
415 		g_return_val_if_fail (buffer->used != 0, NULL);
416 
417 		/* finally get first token */
418 		token = buffer->data;
419 	}
420 
421 	return html_tokenizer_converted_token (t,token);
422 }
423 
424 /* test iconv for valid*/
425 gboolean
is_valid_g_iconv(const GIConv iconv_cd)426 is_valid_g_iconv (const GIConv iconv_cd)
427 {
428 	return iconv_cd != NULL && iconv_cd != (GIConv) - 1;
429 }
430 
431 /*Convert only chars when code >127*/
432 gboolean
is_need_convert(const gchar * token)433 is_need_convert (const gchar *token)
434 {
435 	gint i = strlen (token);
436 	for (; i >= 0; i--)
437 		if (token[i]&128)
438 			return TRUE;
439 	return FALSE;
440 }
441 
442 /*Convert entity values in already converted to right charset token*/
443 gchar *
html_tokenizer_convert_entity(gchar * token)444 html_tokenizer_convert_entity (gchar *token)
445 {
446 	gchar *full_pos;
447 	gchar *resulted;
448 	gchar *write_pos;
449 	gchar *read_pos;
450 
451 	if (token == NULL)
452 		return NULL;
453 
454 	/*stop pointer*/
455 	full_pos = token + strlen (token);
456 	resulted = g_new (gchar, strlen (token) + 1);
457 	write_pos = resulted;
458 	read_pos = token;
459 	while (read_pos < full_pos) {
460 		gsize count_chars = strcspn (read_pos, "&");
461 		memcpy (write_pos, read_pos, count_chars);
462 		write_pos += count_chars;
463 		read_pos += count_chars;
464 		/*may be end string?*/
465 		if (read_pos < full_pos)
466 			if (*read_pos == '&') {
467 				/*value to add*/
468 				gunichar value = INVALID_ENTITY_CHARACTER_MARKER;
469 				/*skip not needed &*/
470 				read_pos++;
471 				count_chars = strcspn (read_pos, ";");
472 				if (count_chars < 14 && count_chars > 1) {
473 					/*save for recovery*/
474 					gchar save_gchar = *(read_pos + count_chars);
475 					*(read_pos + count_chars)=0;
476 					/* &#******; */
477 					if (*read_pos == '#') {
478 						/* &#1234567 */
479 						if (isdigit (*(read_pos + 1))) {
480 							value = strtoull (read_pos + 1, NULL, 10);
481 						/* &#xdd; */
482 						} else if (*(read_pos + 1) == 'x') {
483 							value = strtoull (read_pos + 2, NULL, 16);
484 						}
485 					} else {
486 						value = html_entity_parse (read_pos, strlen (read_pos));
487 					}
488 					if (*read_pos == '#' || value != INVALID_ENTITY_CHARACTER_MARKER) {
489 						write_pos += g_unichar_to_utf8 (value, write_pos);
490 						read_pos += (count_chars + 1);
491 					} else {
492 						/*recovery old value - it's not entity*/
493 						write_pos += g_unichar_to_utf8 ('&', write_pos);
494 						*(read_pos + count_chars) = save_gchar;
495 					}
496 				}
497 				else
498 					/*very large string*/
499 					write_pos += g_unichar_to_utf8 ('&', write_pos);
500 			}
501 	}
502 	*write_pos = 0;
503 	free (token);
504 
505 	return resulted;
506 }
507 
508 /*convert text to utf8 - allways alloc memmory*/
509 gchar *
convert_text_encoding(const GIConv iconv_cd,const gchar * token)510 convert_text_encoding (const GIConv iconv_cd,
511                        const gchar *token)
512 {
513 	gsize currlength;
514 	gchar *newbuffer;
515 	gchar *returnbuffer;
516 	const gchar *current;
517 	gsize newlength;
518 	gsize oldlength;
519 
520 	if (token == NULL)
521 		return NULL;
522 
523 	if (is_valid_g_iconv (iconv_cd) && is_need_convert (token)) {
524 		currlength = strlen (token);
525 		current = token;
526 		newlength = currlength * 7 + 1;
527 		oldlength = newlength;
528 		newbuffer = g_new (gchar, newlength);
529 		returnbuffer = newbuffer;
530 
531 		while (currlength > 0) {
532 			/*function not change current, but g_iconv use not const source*/
533 			g_iconv (iconv_cd, (gchar **) &current, &currlength, &newbuffer, &newlength);
534 			if (currlength > 0) {
535 				g_warning ("IconvError=%s", current);
536 				*newbuffer = INVALID_ENTITY_CHARACTER_MARKER;
537 				newbuffer++;
538 				current++;
539 				currlength--;
540 				newlength--;
541 			}
542 		}
543 		returnbuffer[oldlength - newlength] = '\0';
544 		returnbuffer = g_realloc (returnbuffer, oldlength - newlength + 1);
545 		return returnbuffer;
546 	}
547 	return g_strdup (token);
548 }
549 
550 static gchar *
html_tokenizer_converted_token(HTMLTokenizer * t,const gchar * token)551 html_tokenizer_converted_token (HTMLTokenizer *t,
552                                 const gchar *token)
553 {
554 	if (token != NULL) {
555 		struct _HTMLTokenizerPrivate *p = t->priv;
556 		return html_tokenizer_convert_entity (convert_text_encoding (p->iconv_cd, token));
557 	}
558 
559 	return NULL;
560 }
561 
562 static const gchar *
html_tokenizer_real_get_content_type(HTMLTokenizer * t)563 html_tokenizer_real_get_content_type (HTMLTokenizer *t)
564 {
565 	struct _HTMLTokenizerPrivate *p = t->priv;
566 
567 	if (p->content_type)
568 		return p->content_type;
569 
570 	return NULL;
571 }
572 
573 static gboolean
html_tokenizer_real_get_engine_type(HTMLTokenizer * t)574 html_tokenizer_real_get_engine_type (HTMLTokenizer *t)
575 {
576 	struct _HTMLTokenizerPrivate *p = t->priv;
577 
578 	return p->enableconvert;
579 }
580 
581 static gchar *
html_tokenizer_real_next_token(HTMLTokenizer * t)582 html_tokenizer_real_next_token (HTMLTokenizer *t)
583 {
584 	struct _HTMLTokenizerPrivate *p = t->priv;
585 	gchar *token;
586 
587 	g_assert (p->read_buf);
588 
589 	/* token is in current read_buf */
590 	if (p->read_buf->used > p->read_pos) {
591 		token = p->read_buf->data + p->read_pos;
592 		p->read_pos += strlen (token) + 1;
593 	} else {
594 		GList *new;
595 
596 		g_assert (p->read_cur);
597 		g_assert (p->read_buf);
598 
599 		/* lookup for next buffer */
600 		new = p->read_cur->next;
601 		g_assert (new);
602 
603 		/* destroy current buffer */
604 		p->token_buffers = g_list_remove (p->token_buffers, p->read_buf);
605 		html_token_buffer_destroy (p->read_buf);
606 
607 		p->read_cur = new;
608 		p->read_buf = (HTMLTokenBuffer *) new->data;
609 
610 		g_return_val_if_fail (p->read_buf->used != 0, NULL);
611 
612 		/* finally get first token */
613 		token = p->read_buf->data;
614 		p->read_pos = strlen (token) + 1;
615 	}
616 
617 	p->tokens_num--;
618 	g_assert (p->tokens_num >= 0);
619 
620 	return html_tokenizer_converted_token (t, token);
621 }
622 
623 static gboolean
html_tokenizer_real_has_more_tokens(HTMLTokenizer * t)624 html_tokenizer_real_has_more_tokens (HTMLTokenizer *t)
625 {
626 	return t->priv->tokens_num > 0;
627 }
628 
629 static HTMLTokenizer *
html_tokenizer_real_clone(HTMLTokenizer * t)630 html_tokenizer_real_clone (HTMLTokenizer *t)
631 {
632 	return html_tokenizer_new ();
633 }
634 
635 static void
html_tokenizer_reset(HTMLTokenizer * t)636 html_tokenizer_reset (HTMLTokenizer *t)
637 {
638 	struct _HTMLTokenizerPrivate *p = t->priv;
639 	GList *cur = p->token_buffers;
640 
641 	/* free remaining token buffers */
642 	while (cur) {
643 		g_assert (cur->data);
644 		html_token_buffer_destroy ((HTMLTokenBuffer *) cur->data);
645 		cur = cur->next;
646 	}
647 
648 	/* reset buffer list */
649 	g_list_free (p->token_buffers);
650 	p->token_buffers = p->read_cur = NULL;
651 	p->read_buf = p->write_buf = NULL;
652 	p->read_pos = 0;
653 
654 	/* reset token counters */
655 	p->tokens_num = p->blocking_tokens_num = 0;
656 
657 	if (p->buffer)
658 		g_free (p->buffer);
659 	p->buffer = NULL;
660 	p->dest = NULL;
661 	p->size = 0;
662 
663 	if (p->scriptCode)
664 		g_free (p->scriptCode);
665 	p->scriptCode = NULL;
666 }
667 
668 static gboolean
charset_is_utf8(const gchar * content_type)669 charset_is_utf8 (const gchar *content_type)
670 {
671 	return content_type && strstr (content_type, "=utf-8") != NULL;
672 }
673 
674 static gboolean
is_text(const gchar * content_type)675 is_text (const gchar *content_type)
676 {
677 	return content_type && strstr (content_type, "text/") != NULL;
678 }
679 
680 static const gchar *
get_encoding_from_content_type(const gchar * content_type)681 get_encoding_from_content_type (const gchar *content_type)
682 {
683 	gchar * charset;
684 	if (content_type)
685 	{
686 		charset =  g_strrstr (content_type, "charset=");
687 		if (charset != NULL)
688 			return charset + strlen ("charset=");
689 		charset =  g_strrstr (content_type, "encoding=");
690 		if (charset != NULL)
691 			return charset + strlen ("encoding=");
692 
693 	}
694 	return NULL;
695 }
696 
697 GIConv
generate_iconv_from(const gchar * content_type)698 generate_iconv_from (const gchar *content_type)
699 {
700 	if (content_type)
701 		if (!charset_is_utf8 (content_type))
702 		{
703 			const gchar * encoding = get_encoding_from_content_type (content_type);
704 			if (encoding)
705 				return g_iconv_open ("utf-8", encoding);
706 		}
707 	return NULL;
708 }
709 
710 GIConv
generate_iconv_to(const gchar * content_type)711 generate_iconv_to (const gchar *content_type)
712 {
713 	if (content_type)
714 		if (!charset_is_utf8 (content_type))
715 		{
716 			const gchar * encoding = get_encoding_from_content_type (content_type);
717 			if (encoding)
718 				return g_iconv_open (encoding, "utf-8");
719 		}
720 	return NULL;
721 }
722 
723 static void
html_tokenizer_real_engine_type(HTMLTokenizer * t,gboolean engine_type)724 html_tokenizer_real_engine_type (HTMLTokenizer *t,
725                                  gboolean engine_type)
726 {
727 	struct _HTMLTokenizerPrivate *p;
728 	p = t->priv;
729 
730 	p->enableconvert = engine_type;
731 }
732 
733 static void
html_tokenizer_real_change(HTMLTokenizer * t,const gchar * content_type)734 html_tokenizer_real_change (HTMLTokenizer *t,
735                             const gchar *content_type)
736 {
737 	struct _HTMLTokenizerPrivate *p;
738 	if (!is_text (content_type))
739 		return;
740 
741 	p = t->priv;
742 
743 	if (!p->enableconvert)
744 		return;
745 
746 	if (p->content_type)
747 		g_free (p->content_type);
748 
749 	p->content_type = g_ascii_strdown (content_type, -1);
750 
751 	if (is_valid_g_iconv (p->iconv_cd))
752 		g_iconv_close (p->iconv_cd);
753 
754 	p->iconv_cd = generate_iconv_from (p->content_type);
755 
756 #if 0
757 	if (charset_is_utf8 (p->content_type))
758 		g_warning ("Trying UTF-8");
759 	else
760 		g_warning ("Trying %s",p->content_type);
761 #endif
762 }
763 
764 static void
html_tokenizer_real_begin(HTMLTokenizer * t,const gchar * content_type)765 html_tokenizer_real_begin (HTMLTokenizer *t,
766                            const gchar *content_type)
767 {
768 	struct _HTMLTokenizerPrivate *p = t->priv;
769 
770 	html_tokenizer_reset (t);
771 
772 	p->dest = p->buffer;
773 	p->tag = FALSE;
774 	p->pending = NonePending;
775 	p->discard = NoneDiscard;
776 	p->pre = 0;
777 	p->script = FALSE;
778 	p->style = FALSE;
779 	p->skipLF = FALSE;
780 	p->select = FALSE;
781 	p->comment = FALSE;
782 	p->textarea = FALSE;
783 	p->startTag = FALSE;
784 	p->extension = FALSE;
785 	p->tquote = NO_QUOTE;
786 	p->searchCount = 0;
787 	p->searchGtkHTMLCount = 0;
788 	p->title = FALSE;
789 
790 	html_tokenizer_real_change (t, content_type);
791 }
792 
793 static void
destroy_blocking(gpointer data,gpointer user_data)794 destroy_blocking (gpointer data,
795                   gpointer user_data)
796 {
797 	g_free (data);
798 }
799 
800 static void
html_tokenizer_real_end(HTMLTokenizer * t)801 html_tokenizer_real_end (HTMLTokenizer *t)
802 {
803 	struct _HTMLTokenizerPrivate *p = t->priv;
804 
805 	if (p->buffer == 0)
806 		return;
807 
808 	if (p->dest > p->buffer) {
809 		html_tokenizer_append_token (t, p->buffer, p->dest - p->buffer);
810 	}
811 
812 	g_free (p->buffer);
813 
814 	p->buffer = NULL;
815 	p->dest = NULL;
816 	p->size = 0;
817 
818 	if (p->blocking) {
819 		g_list_foreach (p->blocking, destroy_blocking, NULL);
820 		p->tokens_num += p->blocking_tokens_num;
821 		p->blocking_tokens_num = 0;
822 	}
823 	p->blocking = NULL;
824 }
825 
826 static void
html_tokenizer_append_token(HTMLTokenizer * t,const gchar * string,gint len)827 html_tokenizer_append_token (HTMLTokenizer *t,
828                              const gchar *string,
829                              gint len)
830 {
831 	struct _HTMLTokenizerPrivate *p = t->priv;
832 
833 	if (len < 1)
834 		return;
835 
836 	/* allocate first buffer */
837 	if (p->write_buf == NULL)
838 		html_tokenizer_append_token_buffer (t, len);
839 
840 	/* try append token to current buffer, if not successful, create append new token buffer */
841 	if (!html_token_buffer_append_token (p->write_buf, string, len)) {
842 		html_tokenizer_append_token_buffer (t, len + 1);
843 		/* now it must pass as we have enough space */
844 		g_assert (html_token_buffer_append_token (p->write_buf, string, len));
845 	}
846 
847 	if (p->blocking) {
848 		p->blocking_tokens_num++;
849 	} else {
850 		p->tokens_num++;
851 	}
852 }
853 
854 static void
add_byte(HTMLTokenizer * t,const gchar ** c)855 add_byte (HTMLTokenizer *t,
856           const gchar **c)
857 {
858 	add_char (t,**c);
859 	(*c) ++;
860 }
861 
862 static void
add_char(HTMLTokenizer * t,gchar c)863 add_char (HTMLTokenizer *t,
864           gchar c)
865 {
866 	struct _HTMLTokenizerPrivate *p = t->priv;
867 	if (c != '\0')
868 	{
869 		*(p->dest) = c;
870 		p->dest++;
871 		*(p->dest) = 0;
872 	}
873 }
874 
875 static void
html_tokenizer_append_token_buffer(HTMLTokenizer * t,gint min_size)876 html_tokenizer_append_token_buffer (HTMLTokenizer *t,
877                                     gint min_size)
878 {
879 	struct _HTMLTokenizerPrivate *p = t->priv;
880 	HTMLTokenBuffer *nb;
881 	gint size = TOKEN_BUFFER_SIZE;
882 
883 	if (min_size > size)
884 		size = min_size + (min_size >> 2);
885 
886 	/* create new buffer and add it to list */
887 	nb = html_token_buffer_new (size);
888 	p->token_buffers = g_list_append (p->token_buffers, nb);
889 
890 	/* this one is now write_buf */
891 	p->write_buf = nb;
892 
893 	/* if we don't have read_buf already set it to this one */
894 	if (p->read_buf == NULL) {
895 		p->read_buf = nb;
896 		p->read_cur = p->token_buffers;
897 	}
898 }
899 
900 /* EP CHECK: OK.  */
901 static void
html_tokenizer_add_pending(HTMLTokenizer * t)902 html_tokenizer_add_pending (HTMLTokenizer *t)
903 {
904 	struct _HTMLTokenizerPrivate *p = t->priv;
905 
906 	if (p->tag || p->select) {
907 		add_char (t, ' ');
908 	}
909 	else if (p->textarea) {
910 		if (p->pending == LFPending)
911 			add_char (t, '\n');
912 		else
913 			add_char (t, ' ');
914 	}
915 	else if (p->pre) {
916 		switch (p->pending) {
917 		case SpacePending:
918 			add_char (t, ' ');
919 			break;
920 		case LFPending:
921 			if (p->dest > p->buffer) {
922 				html_tokenizer_append_token (t, p->buffer, p->dest - p->buffer);
923 			}
924 			p->dest = p->buffer;
925 			add_char (t, TAG_ESCAPE);
926 			add_char (t, '\n');
927 			html_tokenizer_append_token (t, p->buffer, 2);
928 			p->dest = p->buffer;
929 			break;
930 		case TabPending:
931 			add_char (t, '\t');
932 			break;
933 		default:
934 			g_warning ("Unknown pending type: %d\n", (gint) p->pending);
935 			break;
936 		}
937 	}
938 	else {
939 		add_char (t, ' ');
940 	}
941 
942 	p->pending = NonePending;
943 }
944 
945 static void
prepare_enough_space(HTMLTokenizer * t)946 prepare_enough_space (HTMLTokenizer *t)
947 {
948 	struct _HTMLTokenizerPrivate *p = t->priv;
949 
950 	if ((p->dest - p->buffer + 32) > p->size) {
951 		guint off = p->dest - p->buffer;
952 
953 		p->size  += (p->size >> 2) + 32;
954 		p->buffer = g_realloc (p->buffer, p->size);
955 		p->dest   = p->buffer + off;
956 	}
957 }
958 
959 static void
in_comment(HTMLTokenizer * t,const gchar ** src)960 in_comment (HTMLTokenizer *t,
961             const gchar **src)
962 {
963 	struct _HTMLTokenizerPrivate *p = t->priv;
964 
965 	if (**src == '-') {		     /* Look for "-->" */
966 		if (p->searchCount < 2)
967 			p->searchCount++;
968 	} else if (p->searchCount == 2 && (**src == '>')) {
969 		p->comment = FALSE;          /* We've got a "-->" sequence */
970 	} else if (tolower (**src) == gtkhtmlStart[p->searchGtkHTMLCount]) {
971 		if (p->searchGtkHTMLCount == 8) {
972 			p->extension    = TRUE;
973 			p->comment = FALSE;
974 			p->searchCount = 0;
975 			p->searchExtensionEndCount = 0;
976 			p->searchGtkHTMLCount = 0;
977 		} else
978 			p->searchGtkHTMLCount++;
979 	} else {
980 		p->searchGtkHTMLCount = 0;
981 		if (p->searchCount < 2)
982 			p->searchCount = 0;
983 	}
984 
985 	(*src)++;
986 }
987 
988 static inline void
extension_one_char(HTMLTokenizer * t,const gchar ** src)989 extension_one_char (HTMLTokenizer *t,
990                     const gchar **src)
991 {
992 	struct _HTMLTokenizerPrivate *p = t->priv;
993 
994 	p->extension = FALSE;
995 	html_tokenizer_tokenize_one_char (t, src);
996 	p->extension = TRUE;
997 }
998 
999 static void
in_extension(HTMLTokenizer * t,const gchar ** src)1000 in_extension (HTMLTokenizer *t,
1001               const gchar **src)
1002 {
1003 	struct _HTMLTokenizerPrivate *p = t->priv;
1004 
1005 	/* check for "-->" */
1006 	if (!p->tquote && **src == '-') {
1007 		if (p->searchExtensionEndCount < 2)
1008 			p->searchExtensionEndCount++;
1009 		(*src) ++;
1010 	} else if (!p->tquote && p->searchExtensionEndCount == 2 && **src == '>') {
1011 		p->extension = FALSE;
1012 		(*src) ++;
1013 	} else {
1014 		if (p->searchExtensionEndCount > 0) {
1015 			if (p->extension) {
1016 				const gchar *c = "-->";
1017 
1018 				while (p->searchExtensionEndCount) {
1019 					extension_one_char (t, &c);
1020 					p->searchExtensionEndCount--;
1021 				}
1022 			}
1023 		}
1024 		extension_one_char (t, src);
1025 	}
1026 }
1027 
1028 static void
in_script_or_style(HTMLTokenizer * t,const gchar ** src)1029 in_script_or_style (HTMLTokenizer *t,
1030                     const gchar **src)
1031 {
1032 	struct _HTMLTokenizerPrivate *p = t->priv;
1033 
1034 	/* Allocate memory to store the script or style */
1035 	if (p->scriptCodeSize + 11 > p->scriptCodeMaxSize)
1036 		p->scriptCode = g_realloc (p->scriptCode, p->scriptCodeMaxSize += 1024);
1037 
1038 	if ((**src == '>') && (p->searchFor[p->searchCount] == '>')) {
1039 		(*src)++;
1040 		p->scriptCode[p->scriptCodeSize] = 0;
1041 		p->scriptCode[p->scriptCodeSize + 1] = 0;
1042 		if (p->script) {
1043 			p->script = FALSE;
1044 		}
1045 		else {
1046 			p->style = FALSE;
1047 		}
1048 		g_free (p->scriptCode);
1049 		p->scriptCode = NULL;
1050 	}
1051 	/* Check if a </script> tag is on its way */
1052 	else if (p->searchCount > 0) {
1053 		gboolean put_to_script = FALSE;
1054 		if (tolower (**src) == p->searchFor[p->searchCount]) {
1055 			p->searchBuffer[p->searchCount] = **src;
1056 			p->searchCount++;
1057 			(*src)++;
1058 		}
1059 		else if (p->searchFor[p->searchCount] == '>') {
1060 			/* There can be any number of white-space characters between
1061 			 * tag name and closing '>' so try to move through them, if possible */
1062 
1063 			const gchar **p = src;
1064 			while (isspace (**p))
1065 				(*p)++;
1066 
1067 			if (**p == '>')
1068 				*src = *p;
1069 			else
1070 				put_to_script = TRUE;
1071 		}
1072 		else
1073 			put_to_script = TRUE;
1074 
1075 		if (put_to_script) {
1076 			gchar *c;
1077 
1078 			p->searchBuffer[p->searchCount] = 0;
1079 			c = p->searchBuffer;
1080 			while (*c)
1081 				p->scriptCode[p->scriptCodeSize++] = *c++;
1082 			p->scriptCode[p->scriptCodeSize] = **src; (*src)++;
1083 			p->searchCount = 0;
1084 		}
1085 	}
1086 	else if (**src == '<') {
1087 		p->searchCount = 1;
1088 		p->searchBuffer[0] = '<';
1089 		(*src)++;
1090 	}
1091 	else {
1092 		p->scriptCode[p->scriptCodeSize] = **src;
1093 		(*src)++;
1094 	}
1095 }
1096 
1097 static void
in_tag(HTMLTokenizer * t,const gchar ** src)1098 in_tag (HTMLTokenizer *t,
1099         const gchar **src)
1100 {
1101 	struct _HTMLTokenizerPrivate *p = t->priv;
1102 
1103 	p->startTag = FALSE;
1104 	if (**src == '/') {
1105 		if (p->pending == LFPending  && !p->pre) {
1106 			p->pending = NonePending;
1107 		}
1108 	}
1109 	else if (((**src >= 'a') && (**src <= 'z'))
1110 		 || ((**src >= 'A') && (**src <= 'Z'))) {
1111 				/* Start of a start tag */
1112 	}
1113 	else if (**src == '!') {
1114 				/* <!-- comment --> */
1115 	}
1116 	else if (**src == '?') {
1117 				/* <? meta ?> */
1118 	}
1119 	else {
1120 				/* Invalid tag, just add it */
1121 		if (p->pending)
1122 			html_tokenizer_add_pending (t);
1123 		add_char (t, '<');
1124 		add_byte (t, src);
1125 		return;
1126 	}
1127 
1128 	if (p->pending)
1129 		html_tokenizer_add_pending (t);
1130 
1131 	if (p->dest > p->buffer) {
1132 		html_tokenizer_append_token (t, p->buffer, p->dest - p->buffer);
1133 		p->dest = p->buffer;
1134 	}
1135 	add_char (t, TAG_ESCAPE);
1136 	add_char (t, '<');
1137 	p->tag = TRUE;
1138 	p->searchCount = 1; /* Look for <!-- to start comment */
1139 }
1140 
1141 static void
start_tag(HTMLTokenizer * t,const gchar ** src)1142 start_tag (HTMLTokenizer *t,
1143            const gchar **src)
1144 {
1145 	(*src)++;
1146 	t->priv->startTag = TRUE;
1147 	t->priv->discard  = NoneDiscard;
1148 }
1149 
1150 static void
end_tag(HTMLTokenizer * t,const gchar ** src)1151 end_tag (HTMLTokenizer *t,
1152          const gchar **src)
1153 {
1154 	struct _HTMLTokenizerPrivate *p = t->priv;
1155 	gchar *ptr;
1156 
1157 	p->searchCount = 0; /* Stop looking for <!-- sequence */
1158 
1159 	add_char (t, '>');
1160 
1161 	/* Make the tag lower case */
1162 	ptr = p->buffer + 2;
1163 	if (p->pre || *ptr == '/') {
1164 		/* End tag */
1165 		p->discard = NoneDiscard;
1166 	}
1167 	else {
1168 		/* Start tag */
1169 		/* Ignore CRLFs after a start tag */
1170 		p->discard = LFDiscard;
1171 	}
1172 
1173 	while (*ptr && *ptr !=' ') {
1174 		*ptr = tolower (*ptr);
1175 		ptr++;
1176 	}
1177 	html_tokenizer_append_token (t, p->buffer, p->dest - p->buffer);
1178 	p->dest = p->buffer;
1179 
1180 	p->tag = FALSE;
1181 	p->pending = NonePending;
1182 	(*src)++;
1183 
1184 	if (strncmp (p->buffer + 2, "pre", 3) == 0) {
1185 		p->pre++;
1186 	}
1187 	else if (strncmp (p->buffer + 2, "/pre", 4) == 0) {
1188 		p->pre--;
1189 	}
1190 	else if (strncmp (p->buffer + 2, "textarea", 8) == 0) {
1191 		p->textarea = TRUE;
1192 	}
1193 	else if (strncmp (p->buffer + 2, "/textarea", 9) == 0) {
1194 		p->textarea = FALSE;
1195 	}
1196 	else if (strncmp (p->buffer + 2, "title", 5) == 0) {
1197 		p->title = TRUE;
1198 	}
1199 	else if (strncmp (p->buffer + 2, "/title", 6) == 0) {
1200 		p->title = FALSE;
1201 	}
1202 	else if (strncmp (p->buffer + 2, "script", 6) == 0) {
1203 		p->script = TRUE;
1204 		p->searchCount = 0;
1205 		p->searchFor = scriptEnd;
1206 		p->scriptCode = g_malloc (1024);
1207 		p->scriptCodeSize = 0;
1208 		p->scriptCodeMaxSize = 1024;
1209 	}
1210 	else if (strncmp (p->buffer + 2, "style", 5) == 0) {
1211 		p->style = TRUE;
1212 		p->searchCount = 0;
1213 		p->searchFor = styleEnd;
1214 		p->scriptCode = g_malloc (1024);
1215 		p->scriptCodeSize = 0;
1216 		p->scriptCodeMaxSize = 1024;
1217 	}
1218 	else if (strncmp (p->buffer + 2, "select", 6) == 0) {
1219 		p->select = TRUE;
1220 	}
1221 	else if (strncmp (p->buffer + 2, "/select", 7) == 0) {
1222 		p->select = FALSE;
1223 	}
1224 	else if (strncmp (p->buffer + 2, "tablesdkl", 9) == 0) {
1225 		html_tokenizer_blocking_push (t, Table);
1226 	}
1227 	else {
1228 		if (p->blocking) {
1229 			const gchar *bn = html_tokenizer_blocking_get_name (t);
1230 
1231 			if (strncmp (p->buffer + 1, bn, strlen (bn)) == 0) {
1232 				html_tokenizer_blocking_pop (t);
1233 			}
1234 		}
1235 	}
1236 }
1237 
1238 static void
in_crlf(HTMLTokenizer * t,const gchar ** src)1239 in_crlf (HTMLTokenizer *t,
1240          const gchar **src)
1241 {
1242 	struct _HTMLTokenizerPrivate *p = t->priv;
1243 
1244 	if (p->tquote) {
1245 		if (p->discard == NoneDiscard)
1246 			p->pending = SpacePending;
1247 	}
1248 	else if (p->tag) {
1249 		p->searchCount = 0; /* Stop looking for <!-- sequence */
1250 		if (p->discard == NoneDiscard)
1251 			p->pending = SpacePending; /* Treat LFs inside tags as spaces */
1252 	}
1253 	else if (p->pre || p->textarea) {
1254 		if (p->discard == LFDiscard) {
1255 			/* Ignore this LF */
1256 			p->discard = NoneDiscard; /*  We have discarded 1 LF */
1257 		} else {
1258 			/* Process this LF */
1259 			if (p->pending)
1260 				html_tokenizer_add_pending (t);
1261 			p->pending = LFPending;
1262 		}
1263 	}
1264 	else {
1265 		if (p->discard == LFDiscard) {
1266 			/* Ignore this LF */
1267 			p->discard = NoneDiscard; /* We have discarded 1 LF */
1268 		} else {
1269 			/* Process this LF */
1270 			if (p->pending == NonePending)
1271 				p->pending = LFPending;
1272 		}
1273 	}
1274 	/* Check for MS-DOS CRLF sequence */
1275 	if (**src == '\r') {
1276 		p->skipLF = TRUE;
1277 	}
1278 	(*src)++;
1279 }
1280 
1281 static void
in_space_or_tab(HTMLTokenizer * t,const gchar ** src)1282 in_space_or_tab (HTMLTokenizer *t,
1283                  const gchar **src)
1284 {
1285 	if (t->priv->tquote) {
1286 		if (t->priv->discard == NoneDiscard)
1287 			t->priv->pending = SpacePending;
1288 	}
1289 	else if (t->priv->tag) {
1290 		t->priv->searchCount = 0; /* Stop looking for <!-- sequence */
1291 		if (t->priv->discard == NoneDiscard)
1292 			t->priv->pending = SpacePending;
1293 	}
1294 	else if (t->priv->pre || t->priv->textarea) {
1295 		if (t->priv->pending)
1296 			html_tokenizer_add_pending (t);
1297 		if (**src == ' ')
1298 			t->priv->pending = SpacePending;
1299 		else
1300 			t->priv->pending = TabPending;
1301 	}
1302 	else {
1303 		t->priv->pending = SpacePending;
1304 	}
1305 	(*src)++;
1306 }
1307 
1308 static void
in_quoted(HTMLTokenizer * t,const gchar ** src)1309 in_quoted (HTMLTokenizer *t,
1310            const gchar **src)
1311 {
1312 	/* We treat ' and " the same in tags " */
1313 	t->priv->discard = NoneDiscard;
1314 	if (t->priv->tag) {
1315 		t->priv->searchCount = 0; /* Stop looking for <!-- sequence */
1316 		if ((t->priv->tquote == SINGLE_QUOTE && **src == '\"') /* match " */
1317 		    || (t->priv->tquote == DOUBLE_QUOTE && **src == '\'')) {
1318 			add_char (t, **src);
1319 			(*src)++;
1320 		} else if (*(t->priv->dest - 1) == '=' && !t->priv->tquote) {
1321 			t->priv->discard = SpaceDiscard;
1322 			t->priv->pending = NonePending;
1323 
1324 			if (**src == '\"') /* match " */
1325 				t->priv->tquote = DOUBLE_QUOTE;
1326 			else
1327 				t->priv->tquote = SINGLE_QUOTE;
1328 			add_char (t, **src);
1329 			(*src)++;
1330 		}
1331 		else if (t->priv->tquote) {
1332 			t->priv->tquote = NO_QUOTE;
1333 			add_byte (t, src);
1334 			t->priv->pending = SpacePending;
1335 		}
1336 		else {
1337 			/* Ignore stray "\'" */
1338 			(*src)++;
1339 		}
1340 	}
1341 	else {
1342 		if (t->priv->pending)
1343 			html_tokenizer_add_pending (t);
1344 
1345 		add_byte (t, src);
1346 	}
1347 }
1348 
1349 static void
in_assignment(HTMLTokenizer * t,const gchar ** src)1350 in_assignment (HTMLTokenizer *t,
1351                const gchar **src)
1352 {
1353 	t->priv->discard = NoneDiscard;
1354 	if (t->priv->tag) {
1355 		t->priv->searchCount = 0; /* Stop looking for <!-- sequence */
1356 		add_char (t, '=');
1357 		if (!t->priv->tquote) {
1358 			t->priv->pending = NonePending;
1359 			t->priv->discard = SpaceDiscard;
1360 		}
1361 	}
1362 	else {
1363 		if (t->priv->pending)
1364 			html_tokenizer_add_pending (t);
1365 
1366 		add_char (t, '=');
1367 	}
1368 	(*src)++;
1369 }
1370 
1371 inline static void
in_plain(HTMLTokenizer * t,const gchar ** src)1372 in_plain (HTMLTokenizer *t,
1373           const gchar **src)
1374 {
1375 	struct _HTMLTokenizerPrivate *p = t->priv;
1376 
1377 	p->discard = NoneDiscard;
1378 	if (p->pending)
1379 		html_tokenizer_add_pending (t);
1380 
1381 	if (p->tag) {
1382 		if (p->searchCount > 0) {
1383 			if (**src == commentStart[p->searchCount]) {
1384 				p->searchCount++;
1385 				if (p->searchCount == 4) {
1386 					/* Found <!-- sequence */
1387 					p->comment = TRUE;
1388 					p->dest = p->buffer;
1389 					p->tag = FALSE;
1390 					p->searchCount = 0;
1391 					return;
1392 				}
1393 			}
1394 			else {
1395 				p->searchCount = 0; /* Stop lookinf for <!-- sequence */
1396 			}
1397 		}
1398 	}
1399 
1400 	add_byte (t, src);
1401 }
1402 
1403 static void
html_tokenizer_tokenize_one_char(HTMLTokenizer * t,const gchar ** src)1404 html_tokenizer_tokenize_one_char (HTMLTokenizer *t,
1405                                   const gchar **src)
1406 {
1407 	struct _HTMLTokenizerPrivate *p = t->priv;
1408 
1409 	prepare_enough_space (t);
1410 
1411 	if (p->skipLF && **src != '\n')
1412 		p->skipLF = FALSE;
1413 
1414 	if (p->skipLF)
1415 		(*src) ++;
1416 	else if (p->comment)
1417 		in_comment (t, src);
1418 	else if (p->extension)
1419 		in_extension (t, src);
1420 	else if (p->script || p->style)
1421 		in_script_or_style (t, src);
1422 	else if (p->startTag)
1423 		in_tag (t, src);
1424 	else if (**src == '<' && !p->tag)
1425 		start_tag (t, src);
1426 	else if (**src == '>' && p->tag && !p->tquote)
1427 		end_tag (t, src);
1428 	else if ((**src == '\n') || (**src == '\r'))
1429 		in_crlf (t, src);
1430 	else if ((**src == ' ') || (**src == '\t'))
1431 		in_space_or_tab (t, src);
1432 	else if (**src == '\"' || **src == '\'') /* match " ' */
1433 		in_quoted (t, src);
1434 	else if (**src == '=')
1435 		in_assignment (t, src);
1436 	else
1437 		in_plain (t, src);
1438 }
1439 
1440 static void
html_tokenizer_real_write(HTMLTokenizer * t,const gchar * string,gsize size)1441 html_tokenizer_real_write (HTMLTokenizer *t,
1442                            const gchar *string,
1443                            gsize size)
1444 {
1445 	const gchar *src = string;
1446 
1447 	while ((src - string) < size)
1448 		html_tokenizer_tokenize_one_char (t, &src);
1449 }
1450 
1451 static const gchar *
html_tokenizer_blocking_get_name(HTMLTokenizer * t)1452 html_tokenizer_blocking_get_name (HTMLTokenizer *t)
1453 {
1454 	switch (GPOINTER_TO_INT (t->priv->blocking->data)) {
1455 	case Table:
1456 		return "</tabledkdk";
1457 	}
1458 
1459 	return "";
1460 }
1461 
1462 static void
html_tokenizer_blocking_push(HTMLTokenizer * t,HTMLTokenType tt)1463 html_tokenizer_blocking_push (HTMLTokenizer *t,
1464                               HTMLTokenType tt)
1465 {
1466 	struct _HTMLTokenizerPrivate *p = t->priv;
1467 
1468 	/* block tokenizer - we must block last token in buffers as it was already added */
1469 	if (!p->blocking) {
1470 		p->tokens_num--;
1471 		p->blocking_tokens_num++;
1472 	}
1473 	p->blocking = g_list_prepend (p->blocking, GINT_TO_POINTER (tt));
1474 }
1475 
1476 static void
html_tokenizer_blocking_pop(HTMLTokenizer * t)1477 html_tokenizer_blocking_pop (HTMLTokenizer *t)
1478 {
1479 	struct _HTMLTokenizerPrivate *p = t->priv;
1480 
1481 	p->blocking = g_list_remove (p->blocking, p->blocking->data);
1482 
1483 	/* unblock tokenizer */
1484 	if (!p->blocking) {
1485 		p->tokens_num += p->blocking_tokens_num;
1486 		p->blocking_tokens_num = 0;
1487 	}
1488 }
1489 
1490 /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/
1491 
1492 void
html_tokenizer_begin(HTMLTokenizer * t,const gchar * content_type)1493 html_tokenizer_begin (HTMLTokenizer *t,
1494                       const gchar *content_type)
1495 {
1496 
1497 	g_return_if_fail (t && HTML_IS_TOKENIZER (t));
1498 
1499 	g_signal_emit (t, html_tokenizer_signals[HTML_TOKENIZER_BEGIN_SIGNAL], 0, content_type);
1500 }
1501 
1502 void
html_tokenizer_set_engine_type(HTMLTokenizer * t,gboolean engine_type)1503 html_tokenizer_set_engine_type (HTMLTokenizer *t,
1504                                 gboolean engine_type)
1505 {
1506 	g_return_if_fail (t && HTML_IS_TOKENIZER (t));
1507 
1508 	g_signal_emit (t, html_tokenizer_signals[HTML_TOKENIZER_CHANGEENGINE_SIGNAL], 0, engine_type);
1509 }
1510 
1511 void
html_tokenizer_change_content_type(HTMLTokenizer * t,const gchar * content_type)1512 html_tokenizer_change_content_type (HTMLTokenizer *t,const gchar *content_type)
1513 {
1514 	g_return_if_fail (t && HTML_IS_TOKENIZER (t));
1515 
1516 	g_signal_emit (t, html_tokenizer_signals[HTML_TOKENIZER_CHANGECONTENT_SIGNAL], 0, content_type);
1517 }
1518 
1519 void
html_tokenizer_end(HTMLTokenizer * t)1520 html_tokenizer_end (HTMLTokenizer *t)
1521 {
1522 	g_return_if_fail (t && HTML_IS_TOKENIZER (t));
1523 
1524 	g_signal_emit (t, html_tokenizer_signals[HTML_TOKENIZER_END_SIGNAL], 0);
1525 }
1526 
1527 void
html_tokenizer_write(HTMLTokenizer * t,const gchar * str,gsize size)1528 html_tokenizer_write (HTMLTokenizer *t,
1529                       const gchar *str,
1530                       gsize size)
1531 {
1532 	HTMLTokenizerClass *klass;
1533 
1534 	g_return_if_fail (t && HTML_IS_TOKENIZER (t));
1535 	klass = HTML_TOKENIZER_CLASS (G_OBJECT_GET_CLASS (t));
1536 
1537 	if (klass->write)
1538 		klass->write (t, str, size);
1539 	else
1540 		g_warning ("No write method defined.");
1541 }
1542 
1543 gchar *
html_tokenizer_peek_token(HTMLTokenizer * t)1544 html_tokenizer_peek_token (HTMLTokenizer *t)
1545 {
1546 	HTMLTokenizerClass *klass;
1547 
1548 	g_return_val_if_fail (t && HTML_IS_TOKENIZER (t), NULL);
1549 
1550 	klass = HTML_TOKENIZER_CLASS (G_OBJECT_GET_CLASS (t));
1551 
1552 	if (klass->peek_token)
1553 		return klass->peek_token (t);
1554 
1555 	g_warning ("No peek_token method defined.");
1556 	return NULL;
1557 
1558 }
1559 
1560 const gchar *
html_tokenizer_get_content_type(HTMLTokenizer * t)1561 html_tokenizer_get_content_type (HTMLTokenizer *t)
1562 {
1563 	HTMLTokenizerClass *klass;
1564 
1565 	g_return_val_if_fail (t && HTML_IS_TOKENIZER (t), NULL);
1566 
1567 	klass = HTML_TOKENIZER_CLASS (G_OBJECT_GET_CLASS (t));
1568 
1569 	if (klass->get_content_type)
1570 		return  klass->get_content_type (t);
1571 
1572 	g_warning ("No get_content_type method defined.");
1573 	return NULL;
1574 
1575 }
1576 
1577 gboolean
html_tokenizer_get_engine_type(HTMLTokenizer * t)1578 html_tokenizer_get_engine_type (HTMLTokenizer *t)
1579 {
1580 	HTMLTokenizerClass *klass;
1581 
1582 	g_return_val_if_fail (t && HTML_IS_TOKENIZER (t),FALSE);
1583 
1584 	klass = HTML_TOKENIZER_CLASS (G_OBJECT_GET_CLASS (t));
1585 
1586 	if (klass->get_engine_type)
1587 		return  klass->get_engine_type (t);
1588 
1589 	g_warning ("No get_engine_type method defined.");
1590 	return FALSE;
1591 }
1592 
1593 gchar *
html_tokenizer_next_token(HTMLTokenizer * t)1594 html_tokenizer_next_token (HTMLTokenizer *t)
1595 {
1596 	HTMLTokenizerClass *klass;
1597 
1598 	g_return_val_if_fail (t && HTML_IS_TOKENIZER (t), NULL);
1599 
1600 	klass = HTML_TOKENIZER_CLASS (G_OBJECT_GET_CLASS (t));
1601 
1602 	if (klass->next_token)
1603 		return klass->next_token (t);
1604 
1605 	g_warning ("No next_token method defined.");
1606 	return NULL;
1607 }
1608 
1609 gboolean
html_tokenizer_has_more_tokens(HTMLTokenizer * t)1610 html_tokenizer_has_more_tokens (HTMLTokenizer *t)
1611 {
1612 	HTMLTokenizerClass *klass;
1613 
1614 	g_return_val_if_fail (t && HTML_IS_TOKENIZER (t), FALSE);
1615 
1616 	klass = HTML_TOKENIZER_CLASS (G_OBJECT_GET_CLASS (t));
1617 
1618 	if (klass->has_more) {
1619 		return klass->has_more (t);
1620 	}
1621 
1622 	g_warning ("No has_more method defined.");
1623 	return FALSE;
1624 
1625 }
1626 
1627 HTMLTokenizer *
html_tokenizer_clone(HTMLTokenizer * t)1628 html_tokenizer_clone (HTMLTokenizer *t)
1629 {
1630 	HTMLTokenizerClass *klass;
1631 
1632 	if (t == NULL)
1633 		return NULL;
1634 	g_return_val_if_fail (HTML_IS_TOKENIZER (t), NULL);
1635 
1636 	klass = HTML_TOKENIZER_CLASS (G_OBJECT_GET_CLASS (t));
1637 
1638 	if (klass->clone)
1639 		return klass->clone (t);
1640 
1641 	g_warning ("No clone method defined.");
1642 	return NULL;
1643 }
1644