1 /*
2  * Copyright (C) 1999-2008 Novell, Inc. (www.novell.com)
3  *
4  * This library is free software: you can redistribute it and/or modify it
5  * under the terms of the GNU Lesser General Public License as published by
6  * the Free Software Foundation.
7  *
8  * This library is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
10  * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
11  * for more details.
12  *
13  * You should have received a copy of the GNU Lesser General Public License
14  * along with this library. If not, see <http://www.gnu.org/licenses/>.
15  *
16  * Authors: Michael Zucchi <notzed@ximian.com>
17  */
18 
19 /* WARNING
20  *
21  * DO NOT USE THIS CODE OUTSIDE OF CAMEL
22  *
23  * IT IS SUBJECT TO CHANGE OR MAY VANISH AT ANY TIME
24  */
25 
26 #include <ctype.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 
31 #include "camel-html-parser.h"
32 
33 /* if defined, must also compile in dump_tag() below somewhere */
34 #define d(x)
35 
36 /* Parser definitions, see below object code for details */
37 
38 struct _CamelHTMLParserPrivate {
39 	gchar *inbuf,
40 		*inptr,
41 		*inend,
42 		*start;
43 	CamelHTMLParserState state;
44 	gchar *charset;
45 	gint eof;
46 	GString *tag;
47 	GString *ent;
48 	gchar ent_utf8[8];
49 	gint attr;
50 	GPtrArray *attrs;
51 	GPtrArray *values;
52 	gint quote;
53 };
54 
/* Tokenizer internals; their definitions follow the public API further down. */
static void tokenize_setup (void);
static CamelHTMLParserPrivate *tokenize_init (void);
static void tokenize_free (CamelHTMLParserPrivate *p);
static gint tokenize_step (CamelHTMLParserPrivate *p, gchar **datap, gint *lenp);

/* Declares CamelHTMLParser as a type derived from G_TYPE_OBJECT. */
G_DEFINE_TYPE (CamelHTMLParser, camel_html_parser, G_TYPE_OBJECT)
61 
62 /* ********************************************************************** */
63 
64 static void
65 html_parser_finalize (GObject *object)
66 {
67 	CamelHTMLParser *parser = CAMEL_HTML_PARSER (object);
68 
69 	tokenize_free (parser->priv);
70 
71 	/* Chain up to parent's finalize() method. */
72 	G_OBJECT_CLASS (camel_html_parser_parent_class)->finalize (object);
73 }
74 
75 static void
camel_html_parser_class_init(CamelHTMLParserClass * class)76 camel_html_parser_class_init (CamelHTMLParserClass *class)
77 {
78 	GObjectClass *object_class;
79 
80 	object_class = G_OBJECT_CLASS (class);
81 	object_class->finalize = html_parser_finalize;
82 
83 	tokenize_setup ();
84 }
85 
86 static void
camel_html_parser_init(CamelHTMLParser * parser)87 camel_html_parser_init (CamelHTMLParser *parser)
88 {
89 	parser->priv = tokenize_init ();
90 }
91 
92 /**
93  * camel_html_parser_new:
94  *
95  * Create a new CamelHTMLParser object.
96  *
97  * Returns: (transfer full): A new #CamelHTMLParser object
98  **/
99 CamelHTMLParser *
camel_html_parser_new(void)100 camel_html_parser_new (void)
101 {
102 	return g_object_new (CAMEL_TYPE_HTML_PARSER, NULL);
103 }
104 
/* Points the parser at a new input buffer of @len bytes beginning at @start.
 * Only the pointers are recorded; the bytes themselves are not copied.
 * @last is stored as the end-of-input flag consulted by the tokenizer. */
void camel_html_parser_set_data (CamelHTMLParser *hp, const gchar *start, gint len, gint last)
{
	CamelHTMLParserPrivate *priv = hp->priv;
	gchar *buffer = (gchar *) start;

	priv->inbuf = buffer;
	priv->inptr = buffer;
	priv->inend = buffer + len;
	priv->eof = last;
}
113 
camel_html_parser_step(CamelHTMLParser * hp,const gchar ** datap,gint * lenp)114 CamelHTMLParserState camel_html_parser_step (CamelHTMLParser *hp, const gchar **datap, gint *lenp)
115 {
116 	return tokenize_step (hp->priv, (gchar **) datap, lenp);
117 }
118 
camel_html_parser_left(CamelHTMLParser * hp,gint * lenp)119 const gchar *camel_html_parser_left (CamelHTMLParser *hp, gint *lenp)
120 {
121 	CamelHTMLParserPrivate *p = hp->priv;
122 
123 	if (lenp)
124 		*lenp = p->inend - p->inptr;
125 
126 	return p->inptr;
127 }
128 
camel_html_parser_tag(CamelHTMLParser * hp)129 const gchar *camel_html_parser_tag (CamelHTMLParser *hp)
130 {
131 	return hp->priv->tag->str;
132 }
133 
/* Looks up attribute @name (ASCII case-insensitive) and returns its value,
 * or NULL when no slot carries that name.  The returned string is owned by
 * the parser.
 *
 * NOTE(review): every slot of the attrs array is searched, not just the ones
 * filled for the most recent tag - confirm that slots left over from earlier
 * tags are meant to be visible here. */
const gchar *camel_html_parser_attr (CamelHTMLParser *hp, const gchar *name)
{
	CamelHTMLParserPrivate *priv = hp->priv;
	gint idx = 0;

	while (idx < priv->attrs->len) {
		const GString *key = priv->attrs->pdata[idx];

		if (g_ascii_strcasecmp (key->str, name) == 0) {
			const GString *value = priv->values->pdata[idx];

			return value->str;
		}

		idx++;
	}

	return NULL;
}
147 
148 /**
149  * camel_html_parser_attr_list:
150  * @hp: a #CamelHTMLParser
151  * @values: (nullable) (element-type utf8) (inout): an output #GPtrArray with values, or %NULL
152  *
153  * Provides parsed array of values and attributes. Both arrays are
154  * owned by the @hp.
155  *
156  * Returns: (element-type utf8) (transfer none): a #GPtrArray of parsed attributes
157  *
158  **/
159 const GPtrArray *
camel_html_parser_attr_list(CamelHTMLParser * hp,const GPtrArray ** values)160 camel_html_parser_attr_list (CamelHTMLParser *hp,
161 			     const GPtrArray **values)
162 {
163 	if (values)
164 		*values = hp->priv->values;
165 
166 	return hp->priv->attrs;
167 }
168 
169 /* this map taken out of libxml */
170 static struct {
171 	guint val;
172 	const gchar *name;
173 } entity_map[] = {
174 /*
175  * the 4 absolute ones,
176  */
177 	{ 34,	"quot",	/* quotation mark = APL quote, U+0022 ISOnum */ },
178 	{ 38,	"amp",	/* ampersand, U+0026 ISOnum */ },
179 	{ 60,	"lt",	/* less-than sign, U+003C ISOnum */ },
180 	{ 62,	"gt",	/* greater-than sign, U+003E ISOnum */ },
181 
182 /*
183  * A bunch still in the 128-255 range
184  * Replacing them depend really on the charset used.
185  */
186 	{ 39,	"apos",	/* single quote */ },
187 	{ 160,	"nbsp",	/* no-break space = non-breaking space, U+00A0 ISOnum */ },
188 	{ 161,	"iexcl",/* inverted exclamation mark, U+00A1 ISOnum */ },
189 	{ 162,	"cent",	/* cent sign, U+00A2 ISOnum */ },
190 	{ 163,	"pound",/* pound sign, U+00A3 ISOnum */ },
191 	{ 164,	"curren",/* currency sign, U+00A4 ISOnum */ },
192 	{ 165,	"yen",	/* yen sign = yuan sign, U+00A5 ISOnum */ },
193 	{ 166,	"brvbar",/* broken bar = broken vertical bar, U+00A6 ISOnum */ },
194 	{ 167,	"sect",	/* section sign, U+00A7 ISOnum */ },
195 	{ 168,	"uml",	/* diaeresis = spacing diaeresis, U+00A8 ISOdia */ },
196 	{ 169,	"copy",	/* copyright sign, U+00A9 ISOnum */ },
197 	{ 170,	"ordf",	/* feminine ordinal indicator, U+00AA ISOnum */ },
198 	{ 171,	"laquo",/* left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum */ },
199 	{ 172,	"not",	/* not sign, U+00AC ISOnum */ },
200 	{ 173,	"shy",	/* soft hyphen = discretionary hyphen, U+00AD ISOnum */ },
201 	{ 174,	"reg",	/* registered sign = registered trade mark sign, U+00AE ISOnum */ },
202 	{ 175,	"macr",	/* macron = spacing macron = overline = APL overbar, U+00AF ISOdia */ },
203 	{ 176,	"deg",	/* degree sign, U+00B0 ISOnum */ },
204 	{ 177,	"plusmn",/* plus-minus sign = plus-or-minus sign, U+00B1 ISOnum */ },
205 	{ 178,	"sup2",	/* superscript two = superscript digit two = squared, U+00B2 ISOnum */ },
206 	{ 179,	"sup3",	/* superscript three = superscript digit three = cubed, U+00B3 ISOnum */ },
207 	{ 180,	"acute",/* acute accent = spacing acute, U+00B4 ISOdia */ },
208 	{ 181,	"micro",/* micro sign, U+00B5 ISOnum */ },
209 	{ 182,	"para",	/* pilcrow sign = paragraph sign, U+00B6 ISOnum */ },
210 	{ 183,	"middot",/* middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum */ },
211 	{ 184,	"cedil",/* cedilla = spacing cedilla, U+00B8 ISOdia */ },
212 	{ 185,	"sup1",	/* superscript one = superscript digit one, U+00B9 ISOnum */ },
213 	{ 186,	"ordm",	/* masculine ordinal indicator, U+00BA ISOnum */ },
214 	{ 187,	"raquo",/* right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum */ },
215 	{ 188,	"frac14",/* vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum */ },
216 	{ 189,	"frac12",/* vulgar fraction one half = fraction one half, U+00BD ISOnum */ },
217 	{ 190,	"frac34",/* vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum */ },
218 	{ 191,	"iquest",/* inverted question mark = turned question mark, U+00BF ISOnum */ },
219 	{ 192,	"Agrave",/* latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 */ },
220 	{ 193,	"Aacute",/* latin capital letter A with acute, U+00C1 ISOlat1 */ },
221 	{ 194,	"Acirc",/* latin capital letter A with circumflex, U+00C2 ISOlat1 */ },
222 	{ 195,	"Atilde",/* latin capital letter A with tilde, U+00C3 ISOlat1 */ },
223 	{ 196,	"Auml",	/* latin capital letter A with diaeresis, U+00C4 ISOlat1 */ },
224 	{ 197,	"Aring",/* latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 */ },
225 	{ 198,	"AElig",/* latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 */ },
226 	{ 199,	"Ccedil",/* latin capital letter C with cedilla, U+00C7 ISOlat1 */ },
227 	{ 200,	"Egrave",/* latin capital letter E with grave, U+00C8 ISOlat1 */ },
228 	{ 201,	"Eacute",/* latin capital letter E with acute, U+00C9 ISOlat1 */ },
229 	{ 202,	"Ecirc",/* latin capital letter E with circumflex, U+00CA ISOlat1 */ },
230 	{ 203,	"Euml",	/* latin capital letter E with diaeresis, U+00CB ISOlat1 */ },
231 	{ 204,	"Igrave",/* latin capital letter I with grave, U+00CC ISOlat1 */ },
232 	{ 205,	"Iacute",/* latin capital letter I with acute, U+00CD ISOlat1 */ },
233 	{ 206,	"Icirc",/* latin capital letter I with circumflex, U+00CE ISOlat1 */ },
234 	{ 207,	"Iuml",	/* latin capital letter I with diaeresis, U+00CF ISOlat1 */ },
235 	{ 208,	"ETH",	/* latin capital letter ETH, U+00D0 ISOlat1 */ },
236 	{ 209,	"Ntilde",/* latin capital letter N with tilde, U+00D1 ISOlat1 */ },
237 	{ 210,	"Ograve",/* latin capital letter O with grave, U+00D2 ISOlat1 */ },
238 	{ 211,	"Oacute",/* latin capital letter O with acute, U+00D3 ISOlat1 */ },
239 	{ 212,	"Ocirc",/* latin capital letter O with circumflex, U+00D4 ISOlat1 */ },
240 	{ 213,	"Otilde",/* latin capital letter O with tilde, U+00D5 ISOlat1 */ },
241 	{ 214,	"Ouml",	/* latin capital letter O with diaeresis, U+00D6 ISOlat1 */ },
242 	{ 215,	"times",/* multiplication sign, U+00D7 ISOnum */ },
243 	{ 216,	"Oslash",/* latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1 */ },
244 	{ 217,	"Ugrave",/* latin capital letter U with grave, U+00D9 ISOlat1 */ },
245 	{ 218,	"Uacute",/* latin capital letter U with acute, U+00DA ISOlat1 */ },
246 	{ 219,	"Ucirc",/* latin capital letter U with circumflex, U+00DB ISOlat1 */ },
247 	{ 220,	"Uuml",	/* latin capital letter U with diaeresis, U+00DC ISOlat1 */ },
248 	{ 221,	"Yacute",/* latin capital letter Y with acute, U+00DD ISOlat1 */ },
249 	{ 222,	"THORN",/* latin capital letter THORN, U+00DE ISOlat1 */ },
250 	{ 223,	"szlig",/* latin small letter sharp s = ess-zed, U+00DF ISOlat1 */ },
251 	{ 224,	"agrave",/* latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 */ },
252 	{ 225,	"aacute",/* latin small letter a with acute, U+00E1 ISOlat1 */ },
253 	{ 226,	"acirc",/* latin small letter a with circumflex, U+00E2 ISOlat1 */ },
254 	{ 227,	"atilde",/* latin small letter a with tilde, U+00E3 ISOlat1 */ },
255 	{ 228,	"auml",	/* latin small letter a with diaeresis, U+00E4 ISOlat1 */ },
256 	{ 229,	"aring",/* latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 */ },
257 	{ 230,	"aelig",/* latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 */ },
258 	{ 231,	"ccedil",/* latin small letter c with cedilla, U+00E7 ISOlat1 */ },
259 	{ 232,	"egrave",/* latin small letter e with grave, U+00E8 ISOlat1 */ },
260 	{ 233,	"eacute",/* latin small letter e with acute, U+00E9 ISOlat1 */ },
261 	{ 234,	"ecirc",/* latin small letter e with circumflex, U+00EA ISOlat1 */ },
262 	{ 235,	"euml",	/* latin small letter e with diaeresis, U+00EB ISOlat1 */ },
263 	{ 236,	"igrave",/* latin small letter i with grave, U+00EC ISOlat1 */ },
264 	{ 237,	"iacute",/* latin small letter i with acute, U+00ED ISOlat1 */ },
265 	{ 238,	"icirc",/* latin small letter i with circumflex, U+00EE ISOlat1 */ },
266 	{ 239,	"iuml",	/* latin small letter i with diaeresis, U+00EF ISOlat1 */ },
267 	{ 240,	"eth",	/* latin small letter eth, U+00F0 ISOlat1 */ },
268 	{ 241,	"ntilde",/* latin small letter n with tilde, U+00F1 ISOlat1 */ },
269 	{ 242,	"ograve",/* latin small letter o with grave, U+00F2 ISOlat1 */ },
270 	{ 243,	"oacute",/* latin small letter o with acute, U+00F3 ISOlat1 */ },
271 	{ 244,	"ocirc",/* latin small letter o with circumflex, U+00F4 ISOlat1 */ },
272 	{ 245,	"otilde",/* latin small letter o with tilde, U+00F5 ISOlat1 */ },
273 	{ 246,	"ouml",	/* latin small letter o with diaeresis, U+00F6 ISOlat1 */ },
274 	{ 247,	"divide",/* division sign, U+00F7 ISOnum */ },
275 	{ 248,	"oslash",/* latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 */ },
276 	{ 249,	"ugrave",/* latin small letter u with grave, U+00F9 ISOlat1 */ },
277 	{ 250,	"uacute",/* latin small letter u with acute, U+00FA ISOlat1 */ },
278 	{ 251,	"ucirc",/* latin small letter u with circumflex, U+00FB ISOlat1 */ },
279 	{ 252,	"uuml",	/* latin small letter u with diaeresis, U+00FC ISOlat1 */ },
280 	{ 253,	"yacute",/* latin small letter y with acute, U+00FD ISOlat1 */ },
281 	{ 254,	"thorn",/* latin small letter thorn with, U+00FE ISOlat1 */ },
282 	{ 255,	"yuml",	/* latin small letter y with diaeresis, U+00FF ISOlat1 */ },
283 
284 /*
285  * Anything below should really be kept as entities references
286  */
287 	{ 402,	"fnof",	/* latin small f with hook = function = florin, U+0192 ISOtech */ },
288 
289 	{ 913,	"Alpha",/* greek capital letter alpha, U+0391 */ },
290 	{ 914,	"Beta",	/* greek capital letter beta, U+0392 */ },
291 	{ 915,	"Gamma",/* greek capital letter gamma, U+0393 ISOgrk3 */ },
292 	{ 916,	"Delta",/* greek capital letter delta, U+0394 ISOgrk3 */ },
293 	{ 917,	"Epsilon",/* greek capital letter epsilon, U+0395 */ },
294 	{ 918,	"Zeta",	/* greek capital letter zeta, U+0396 */ },
295 	{ 919,	"Eta",	/* greek capital letter eta, U+0397 */ },
296 	{ 920,	"Theta",/* greek capital letter theta, U+0398 ISOgrk3 */ },
297 	{ 921,	"Iota",	/* greek capital letter iota, U+0399 */ },
298 	{ 922,	"Kappa",/* greek capital letter kappa, U+039A */ },
299 	{ 923,	"Lambda"/* greek capital letter lambda, U+039B ISOgrk3 */ },
300 	{ 924,	"Mu",	/* greek capital letter mu, U+039C */ },
301 	{ 925,	"Nu",	/* greek capital letter nu, U+039D */ },
302 	{ 926,	"Xi",	/* greek capital letter xi, U+039E ISOgrk3 */ },
303 	{ 927,	"Omicron",/* greek capital letter omicron, U+039F */ },
304 	{ 928,	"Pi",	/* greek capital letter pi, U+03A0 ISOgrk3 */ },
305 	{ 929,	"Rho",	/* greek capital letter rho, U+03A1 */ },
306 	{ 931,	"Sigma",/* greek capital letter sigma, U+03A3 ISOgrk3 */ },
307 	{ 932,	"Tau",	/* greek capital letter tau, U+03A4 */ },
308 	{ 933,	"Upsilon",/* greek capital letter upsilon, U+03A5 ISOgrk3 */ },
309 	{ 934,	"Phi",	/* greek capital letter phi, U+03A6 ISOgrk3 */ },
310 	{ 935,	"Chi",	/* greek capital letter chi, U+03A7 */ },
311 	{ 936,	"Psi",	/* greek capital letter psi, U+03A8 ISOgrk3 */ },
312 	{ 937,	"Omega",/* greek capital letter omega, U+03A9 ISOgrk3 */ },
313 
314 	{ 945,	"alpha",/* greek small letter alpha, U+03B1 ISOgrk3 */ },
315 	{ 946,	"beta",	/* greek small letter beta, U+03B2 ISOgrk3 */ },
316 	{ 947,	"gamma",/* greek small letter gamma, U+03B3 ISOgrk3 */ },
317 	{ 948,	"delta",/* greek small letter delta, U+03B4 ISOgrk3 */ },
318 	{ 949,	"epsilon",/* greek small letter epsilon, U+03B5 ISOgrk3 */ },
319 	{ 950,	"zeta",	/* greek small letter zeta, U+03B6 ISOgrk3 */ },
320 	{ 951,	"eta",	/* greek small letter eta, U+03B7 ISOgrk3 */ },
321 	{ 952,	"theta",/* greek small letter theta, U+03B8 ISOgrk3 */ },
322 	{ 953,	"iota",	/* greek small letter iota, U+03B9 ISOgrk3 */ },
323 	{ 954,	"kappa",/* greek small letter kappa, U+03BA ISOgrk3 */ },
324 	{ 955,	"lambda",/* greek small letter lambda, U+03BB ISOgrk3 */ },
325 	{ 956,	"mu",	/* greek small letter mu, U+03BC ISOgrk3 */ },
326 	{ 957,	"nu",	/* greek small letter nu, U+03BD ISOgrk3 */ },
327 	{ 958,	"xi",	/* greek small letter xi, U+03BE ISOgrk3 */ },
328 	{ 959,	"omicron",/* greek small letter omicron, U+03BF NEW */ },
329 	{ 960,	"pi",	/* greek small letter pi, U+03C0 ISOgrk3 */ },
330 	{ 961,	"rho",	/* greek small letter rho, U+03C1 ISOgrk3 */ },
331 	{ 962,	"sigmaf",/* greek small letter final sigma, U+03C2 ISOgrk3 */ },
332 	{ 963,	"sigma",/* greek small letter sigma, U+03C3 ISOgrk3 */ },
333 	{ 964,	"tau",	/* greek small letter tau, U+03C4 ISOgrk3 */ },
334 	{ 965,	"upsilon",/* greek small letter upsilon, U+03C5 ISOgrk3 */ },
335 	{ 966,	"phi",	/* greek small letter phi, U+03C6 ISOgrk3 */ },
336 	{ 967,	"chi",	/* greek small letter chi, U+03C7 ISOgrk3 */ },
337 	{ 968,	"psi",	/* greek small letter psi, U+03C8 ISOgrk3 */ },
338 	{ 969,	"omega",/* greek small letter omega, U+03C9 ISOgrk3 */ },
339 	{ 977,	"thetasym",/* greek small letter theta symbol, U+03D1 NEW */ },
340 	{ 978,	"upsih",/* greek upsilon with hook symbol, U+03D2 NEW */ },
341 	{ 982,	"piv",	/* greek pi symbol, U+03D6 ISOgrk3 */ },
342 
343 	{ 8226,	"bull",	/* bullet = black small circle, U+2022 ISOpub */ },
344 	{ 8230,	"hellip",/* horizontal ellipsis = three dot leader, U+2026 ISOpub */ },
345 	{ 8242,	"prime",/* prime = minutes = feet, U+2032 ISOtech */ },
346 	{ 8243,	"Prime",/* double prime = seconds = inches, U+2033 ISOtech */ },
347 	{ 8254,	"oline",/* overline = spacing overscore, U+203E NEW */ },
348 	{ 8260,	"frasl",/* fraction slash, U+2044 NEW */ },
349 
350 	{ 8472,	"weierp",/* script capital P = power set = Weierstrass p, U+2118 ISOamso */ },
351 	{ 8465,	"image",/* blackletter capital I = imaginary part, U+2111 ISOamso */ },
352 	{ 8476,	"real",	/* blackletter capital R = real part symbol, U+211C ISOamso */ },
353 	{ 8482,	"trade",/* trade mark sign, U+2122 ISOnum */ },
354 	{ 8501,	"alefsym",/* alef symbol = first transfinite cardinal, U+2135 NEW */ },
355 	{ 8592,	"larr",	/* leftwards arrow, U+2190 ISOnum */ },
356 	{ 8593,	"uarr",	/* upwards arrow, U+2191 ISOnum */ },
357 	{ 8594,	"rarr",	/* rightwards arrow, U+2192 ISOnum */ },
358 	{ 8595,	"darr",	/* downwards arrow, U+2193 ISOnum */ },
359 	{ 8596,	"harr",	/* left right arrow, U+2194 ISOamsa */ },
360 	{ 8629,	"crarr",/* downwards arrow with corner leftwards = carriage return, U+21B5 NEW */ },
361 	{ 8656,	"lArr",	/* leftwards double arrow, U+21D0 ISOtech */ },
362 	{ 8657,	"uArr",	/* upwards double arrow, U+21D1 ISOamsa */ },
363 	{ 8658,	"rArr",	/* rightwards double arrow, U+21D2 ISOtech */ },
364 	{ 8659,	"dArr",	/* downwards double arrow, U+21D3 ISOamsa */ },
365 	{ 8660,	"hArr",	/* left right double arrow, U+21D4 ISOamsa */ },
366 
367 	{ 8704,	"forall",/* for all, U+2200 ISOtech */ },
368 	{ 8706,	"part",	/* partial differential, U+2202 ISOtech */ },
369 	{ 8707,	"exist",/* there exists, U+2203 ISOtech */ },
370 	{ 8709,	"empty",/* empty set = null set = diameter, U+2205 ISOamso */ },
371 	{ 8711,	"nabla",/* nabla = backward difference, U+2207 ISOtech */ },
372 	{ 8712,	"isin",	/* element of, U+2208 ISOtech */ },
373 	{ 8713,	"notin",/* not an element of, U+2209 ISOtech */ },
374 	{ 8715,	"ni",	/* contains as member, U+220B ISOtech */ },
375 	{ 8719,	"prod",	/* n-ary product = product sign, U+220F ISOamsb */ },
376 	{ 8721,	"sum",	/* n-ary sumation, U+2211 ISOamsb */ },
377 	{ 8722,	"minus",/* minus sign, U+2212 ISOtech */ },
378 	{ 8727,	"lowast",/* asterisk operator, U+2217 ISOtech */ },
379 	{ 8730,	"radic",/* square root = radical sign, U+221A ISOtech */ },
380 	{ 8733,	"prop",	/* proportional to, U+221D ISOtech */ },
381 	{ 8734,	"infin",/* infinity, U+221E ISOtech */ },
382 	{ 8736,	"ang",	/* angle, U+2220 ISOamso */ },
383 	{ 8743,	"and",	/* logical and = wedge, U+2227 ISOtech */ },
384 	{ 8744,	"or",	/* logical or = vee, U+2228 ISOtech */ },
385 	{ 8745,	"cap",	/* intersection = cap, U+2229 ISOtech */ },
386 	{ 8746,	"cup",	/* union = cup, U+222A ISOtech */ },
387 	{ 8747,	"int",	/* integral, U+222B ISOtech */ },
388 	{ 8756,	"there4",/* therefore, U+2234 ISOtech */ },
389 	{ 8764,	"sim",	/* tilde operator = varies with = similar to, U+223C ISOtech */ },
390 	{ 8773,	"cong",	/* approximately equal to, U+2245 ISOtech */ },
391 	{ 8776,	"asymp",/* almost equal to = asymptotic to, U+2248 ISOamsr */ },
392 	{ 8800,	"ne",	/* not equal to, U+2260 ISOtech */ },
393 	{ 8801,	"equiv",/* identical to, U+2261 ISOtech */ },
394 	{ 8804,	"le",	/* less-than or equal to, U+2264 ISOtech */ },
395 	{ 8805,	"ge",	/* greater-than or equal to, U+2265 ISOtech */ },
396 	{ 8834,	"sub",	/* subset of, U+2282 ISOtech */ },
397 	{ 8835,	"sup",	/* superset of, U+2283 ISOtech */ },
398 	{ 8836,	"nsub",	/* not a subset of, U+2284 ISOamsn */ },
399 	{ 8838,	"sube",	/* subset of or equal to, U+2286 ISOtech */ },
400 	{ 8839,	"supe",	/* superset of or equal to, U+2287 ISOtech */ },
401 	{ 8853,	"oplus",/* circled plus = direct sum, U+2295 ISOamsb */ },
402 	{ 8855,	"otimes",/* circled times = vector product, U+2297 ISOamsb */ },
403 	{ 8869,	"perp",	/* up tack = orthogonal to = perpendicular, U+22A5 ISOtech */ },
404 	{ 8901,	"sdot",	/* dot operator, U+22C5 ISOamsb */ },
405 	{ 8968,	"lceil",/* left ceiling = apl upstile, U+2308 ISOamsc */ },
406 	{ 8969,	"rceil",/* right ceiling, U+2309 ISOamsc */ },
407 	{ 8970,	"lfloor",/* left floor = apl downstile, U+230A ISOamsc */ },
408 	{ 8971,	"rfloor",/* right floor, U+230B ISOamsc */ },
409 	{ 9001,	"lang",	/* left-pointing angle bracket = bra, U+2329 ISOtech */ },
410 	{ 9002,	"rang",	/* right-pointing angle bracket = ket, U+232A ISOtech */ },
411 	{ 9674,	"loz",	/* lozenge, U+25CA ISOpub */ },
412 
413 	{ 9824,	"spades",/* black spade suit, U+2660 ISOpub */ },
414 	{ 9827,	"clubs",/* black club suit = shamrock, U+2663 ISOpub */ },
415 	{ 9829,	"hearts",/* black heart suit = valentine, U+2665 ISOpub */ },
416 	{ 9830,	"diams",/* black diamond suit, U+2666 ISOpub */ },
417 
418 	{ 338,	"OElig",/* latin capital ligature OE, U+0152 ISOlat2 */ },
419 	{ 339,	"oelig",/* latin small ligature oe, U+0153 ISOlat2 */ },
420 	{ 352,	"Scaron",/* latin capital letter S with caron, U+0160 ISOlat2 */ },
421 	{ 353,	"scaron",/* latin small letter s with caron, U+0161 ISOlat2 */ },
422 	{ 376,	"Yuml",	/* latin capital letter Y with diaeresis, U+0178 ISOlat2 */ },
423 	{ 710,	"circ",	/* modifier letter circumflex accent, U+02C6 ISOpub */ },
424 	{ 732,	"tilde",/* small tilde, U+02DC ISOdia */ },
425 
426 	{ 8194,	"ensp",	/* en space, U+2002 ISOpub */ },
427 	{ 8195,	"emsp",	/* em space, U+2003 ISOpub */ },
428 	{ 8201,	"thinsp",/* thin space, U+2009 ISOpub */ },
429 	{ 8204,	"zwnj",	/* zero width non-joiner, U+200C NEW RFC 2070 */ },
430 	{ 8205,	"zwj",	/* zero width joiner, U+200D NEW RFC 2070 */ },
431 	{ 8206,	"lrm",	/* left-to-right mark, U+200E NEW RFC 2070 */ },
432 	{ 8207,	"rlm",	/* right-to-left mark, U+200F NEW RFC 2070 */ },
433 	{ 8211,	"ndash",/* en dash, U+2013 ISOpub */ },
434 	{ 8212,	"mdash",/* em dash, U+2014 ISOpub */ },
435 	{ 8216,	"lsquo",/* left single quotation mark, U+2018 ISOnum */ },
436 	{ 8217,	"rsquo",/* right single quotation mark, U+2019 ISOnum */ },
437 	{ 8218,	"sbquo",/* single low-9 quotation mark, U+201A NEW */ },
438 	{ 8220,	"ldquo",/* left double quotation mark, U+201C ISOnum */ },
439 	{ 8221,	"rdquo",/* right double quotation mark, U+201D ISOnum */ },
440 	{ 8222,	"bdquo",/* double low-9 quotation mark, U+201E NEW */ },
441 	{ 8224,	"dagger",/* dagger, U+2020 ISOpub */ },
442 	{ 8225,	"Dagger",/* double dagger, U+2021 ISOpub */ },
443 	{ 8240,	"permil",/* per mille sign, U+2030 ISOtech */ },
444 	{ 8249,	"lsaquo",/* single left-pointing angle quotation mark, U+2039 ISO proposed */ },
445 	{ 8250,	"rsaquo",/* single right-pointing angle quotation mark, U+203A ISO proposed */ },
446 	{ 8364,	"euro",	/* euro sign, U+20AC NEW */ }
447 };
448 
449 static GHashTable *entities;
450 
451 /* this cannot be called in a thread context */
tokenize_setup(void)452 static void tokenize_setup (void)
453 {
454 	gint i;
455 
456 	if (entities == NULL) {
457 		entities = g_hash_table_new (g_str_hash, g_str_equal);
458 		for (i = 0; i < G_N_ELEMENTS (entity_map); i++) {
459 			g_hash_table_insert (entities, (gchar *) entity_map[i].name, GUINT_TO_POINTER (entity_map[i].val));
460 		}
461 	}
462 }
463 
tokenize_init(void)464 static CamelHTMLParserPrivate *tokenize_init (void)
465 {
466 	CamelHTMLParserPrivate *p;
467 
468 	p = g_malloc (sizeof (*p));
469 	p->state = CAMEL_HTML_PARSER_DATA;
470 
471 	p->attr = 0;
472 	p->attrs = g_ptr_array_new ();
473 	p->values = g_ptr_array_new ();
474 	p->tag = g_string_new ("");
475 	p->ent = g_string_new ("");
476 	p->charset = NULL;
477 
478 	if (entities == NULL)
479 		tokenize_setup ();
480 
481 	return p;
482 }
483 
/* Releases everything owned by the tokenizer state @p, including @p itself.
 * Passing NULL is a no-op. */
static void tokenize_free (CamelHTMLParserPrivate *p)
{
	guint i;

	if (p == NULL)
		return;

	g_string_free (p->tag, TRUE);
	g_string_free (p->ent, TRUE);
	g_free (p->charset);

	/* The arrays hold GStrings that were allocated slot by slot. */
	for (i = 0; i < p->attrs->len; i++)
		g_string_free (p->attrs->pdata[i], TRUE);

	for (i = 0; i < p->values->len; i++)
		g_string_free (p->values->pdata[i], TRUE);

	/* Free the array containers too; previously only their elements were
	 * released, leaking both GPtrArrays for every parser. */
	g_ptr_array_free (p->attrs, TRUE);
	g_ptr_array_free (p->values, TRUE);

	g_free (p);
}
500 
convert_entity(const gchar * e,gchar * ent)501 static gint convert_entity (const gchar *e, gchar *ent)
502 {
503 	guint val;
504 
505 	if (e[0] == '#')
506 		return g_unichar_to_utf8 (atoi (e + 1), ent);
507 
508 	val = GPOINTER_TO_UINT (g_hash_table_lookup (entities, e));
509 	if (ent)
510 		return g_unichar_to_utf8 (val, ent);
511 	else
512 		return 0;
513 }
514 
#if 0
/* Debug helper, compiled out by default: prints the tag name followed by the
 * first p->attr attribute name/value pairs.  It has to be enabled together
 * with the d() macro, whose uses in tokenize_step() call it. */
static void dump_tag (CamelHTMLParserPrivate *p)
{
	gint i;

	printf ("got tag: %s\n", p->tag->str);
	printf ("%d attributes:\n", p->attr);
	for (i = 0; i < p->attr; i++) {
		printf (" %s = '%s'\n", ((GString *) p->attrs->pdata[i])->str, ((GString *) p->values->pdata[i])->str);
	}
}
#endif
527 
tokenize_step(CamelHTMLParserPrivate * p,gchar ** datap,gint * lenp)528 static gint tokenize_step (CamelHTMLParserPrivate *p, gchar **datap, gint *lenp)
529 {
530 	gchar *in = p->inptr;
531 	gchar *inend = p->inend;
532 	gchar c;
533 	gint state = p->state, ret, len;
534 	gchar *start = p->inptr;
535 
536 	d (printf ("Tokenise step\n"));
537 
538 	while (in < inend) {
539 		c = *in++;
540 		switch (state) {
541 		case CAMEL_HTML_PARSER_DATA:
542 			if (c == '<') {
543 				ret = state;
544 				state = CAMEL_HTML_PARSER_TAG;
545 				p->attr = 0;
546 				g_string_truncate (p->tag, 0);
547 				d (printf ("got data '%.*s'\n", in - start - 1, start));
548 				*datap = start;
549 				*lenp = in-start-1;
550 				goto done;
551 			} else if (c == '&') {
552 				ret = state;
553 				state = CAMEL_HTML_PARSER_ENT;
554 				g_string_truncate (p->ent, 0);
555 				g_string_append_c (p->ent, c);
556 				d (printf ("got data '%.*s'\n", in - start - 1, start));
557 				*datap = start;
558 				*lenp = in-start-1;
559 				goto done;
560 			}
561 			break;
562 		case CAMEL_HTML_PARSER_ENT:
563 			if (c == ';') {
564 				len = convert_entity (p->ent->str + 1, p->ent_utf8);
565 				if (len == 0) {
566 					/* handle broken entity */
567 					g_string_append_c (p->ent, c);
568 					ret = state = CAMEL_HTML_PARSER_DATA;
569 					*datap = p->ent->str;
570 					*lenp = p->ent->len;
571 					goto done;
572 				} else {
573 					d (printf ("got entity: %s = %s\n", p->ent->str, p->ent_utf8));
574 					ret = state;
575 					state = CAMEL_HTML_PARSER_DATA;
576 					*datap = p->ent_utf8;
577 					*lenp = len;
578 					goto done;
579 				}
580 			} else if (isalnum (c) || c=='#') { /* FIXME: right type */
581 				g_string_append_c (p->ent, c);
582 			} else {
583 				/* handle broken entity */
584 				g_string_append_c (p->ent, c);
585 				ret = state = CAMEL_HTML_PARSER_DATA;
586 				*datap = p->ent->str;
587 				*lenp = p->ent->len;
588 				goto done;
589 			}
590 			break;
591 		case CAMEL_HTML_PARSER_TAG:
592 			if (c == '!') {
593 				state = CAMEL_HTML_PARSER_COMMENT0;
594 				g_string_append_c (p->tag, c);
595 			} else if (c == '>') {
596 				d (dump_tag (p));
597 				ret = CAMEL_HTML_PARSER_ELEMENT;
598 				state = CAMEL_HTML_PARSER_DATA;
599 				goto done;
600 			} else if (c == ' ' || c == '\n' || c == '\t') {
601 				state = CAMEL_HTML_PARSER_ATTR0;
602 			} else {
603 				g_string_append_c (p->tag, c);
604 			}
605 			break;
606 			/* check for <!-- */
607 		case CAMEL_HTML_PARSER_COMMENT0:
608 			if (c == '-') {
609 				g_string_append_c (p->tag, c);
610 				if (p->tag->len == 3) {
611 					g_string_truncate (p->tag, 0);
612 					state = CAMEL_HTML_PARSER_COMMENT;
613 				}
614 			} else {
615 				/* got something else, probbly dtd entity */
616 				state = CAMEL_HTML_PARSER_DTDENT;
617 			}
618 			break;
619 		case CAMEL_HTML_PARSER_DTDENT:
620 			if (c == '>') {
621 				ret = CAMEL_HTML_PARSER_DTDENT;
622 				state = CAMEL_HTML_PARSER_DATA;
623 				*datap = start;
624 				*lenp = in-start-1;
625 				goto done;
626 			}
627 			break;
628 		case CAMEL_HTML_PARSER_COMMENT:
629 			if (c == '>' && p->tag->len == 2) {
630 				ret = CAMEL_HTML_PARSER_COMMENT;
631 				state = CAMEL_HTML_PARSER_DATA;
632 				*datap = start;
633 				*lenp = in-start-1;
634 				goto done;
635 			} else if (c == '-') {
636 				/* we dont care if we get 'n' --'s before the > */
637 				if (p->tag->len < 2)
638 					g_string_append_c (p->tag, c);
639 			} else {
640 				g_string_truncate (p->tag, 0);
641 			}
642 			break;
643 		case CAMEL_HTML_PARSER_ATTR0:	/* pre-attribute whitespace */
644 			if (c == '>') {
645 				d (dump_tag (p));
646 				ret = CAMEL_HTML_PARSER_ELEMENT;
647 				state = CAMEL_HTML_PARSER_DATA;
648 				goto done;
649 			} else if (c == ' ' || c == '\n' || c == '\t') {
650 			} else {
651 				if (p->attrs->len <= p->attr) {
652 					g_ptr_array_add (p->attrs, g_string_new (""));
653 					g_ptr_array_add (p->values, g_string_new (""));
654 				} else {
655 					g_string_truncate (p->attrs->pdata[p->attr], 0);
656 					g_string_truncate (p->values->pdata[p->attr], 0);
657 				}
658 				g_string_append_c (p->attrs->pdata[p->attr], c);
659 				state = CAMEL_HTML_PARSER_ATTR;
660 			}
661 			break;
662 		case CAMEL_HTML_PARSER_ATTR:
663 			if (c == '>') {
664 				d (dump_tag (p));
665 				ret = CAMEL_HTML_PARSER_ELEMENT;
666 				state = CAMEL_HTML_PARSER_DATA;
667 				goto done;
668 			} else if (c == '=') {
669 				state = CAMEL_HTML_PARSER_VAL0;
670 			} else if (c == ' ' || c == '\n' || c == '\t') {
671 				state = CAMEL_HTML_PARSER_ATTR0;
672 				p->attr++;
673 			} else {
674 				g_string_append_c (p->attrs->pdata[p->attr], c);
675 			}
676 			break;
677 		case CAMEL_HTML_PARSER_VAL0:
678 			if (c == '>') {
679 				d (printf ("value truncated\n"));
680 				d (dump_tag (p));
681 				ret = CAMEL_HTML_PARSER_ELEMENT;
682 				state = CAMEL_HTML_PARSER_DATA;
683 				goto done;
684 			} else if (c == '\'' || c == '\"') {
685 				p->quote = c;
686 				state = CAMEL_HTML_PARSER_VAL;
687 			} else if (c == ' ' || c == '\n' || c == '\t') {
688 			} else {
689 				g_string_append_c (p->values->pdata[p->attr], c);
690 				p->quote = 0;
691 				state = CAMEL_HTML_PARSER_VAL;
692 			}
693 			break;
694 		case CAMEL_HTML_PARSER_VAL:
695 		do_val:
696 			if (p->quote) {
697 				if (c == '>') {
698 					d (printf ("value truncated\n"));
699 					d (dump_tag (p));
700 					ret = CAMEL_HTML_PARSER_ELEMENT;
701 					state = CAMEL_HTML_PARSER_DATA;
702 					p->attr++;
703 					goto done;
704 				} else if (c == p->quote) {
705 					state = CAMEL_HTML_PARSER_ATTR0;
706 					p->attr++;
707 				} else if (c == '&') {
708 					state = CAMEL_HTML_PARSER_VAL_ENT;
709 					g_string_truncate (p->ent, 0);
710 				} else {
711 					g_string_append_c (p->values->pdata[p->attr], c);
712 				}
713 			} else if (c == '>') {
714 				d (dump_tag (p));
715 				ret = CAMEL_HTML_PARSER_ELEMENT;
716 				state = CAMEL_HTML_PARSER_DATA;
717 				p->attr++;
718 				goto done;
719 			} else if (c == ' ' || c == '\n' || c == '\t') {
720 				state = CAMEL_HTML_PARSER_ATTR0;
721 				p->attr++;
722 			} else if (c == '&') {
723 				state = CAMEL_HTML_PARSER_VAL_ENT;
724 				g_string_truncate (p->ent, 0);
725 			} else {
726 				g_string_append_c (p->values->pdata[p->attr], c);
727 			}
728 			break;
729 		case CAMEL_HTML_PARSER_VAL_ENT:
730 			if (c == ';') {
731 				state = CAMEL_HTML_PARSER_VAL;
732 				len = convert_entity (p->ent->str + 1, p->ent_utf8);
733 				if (len == 0) {
734 					/* fallback; broken entity, just output it and see why we ended */
735 					g_string_append (p->values->pdata[p->attr], p->ent->str);
736 					g_string_append_c (p->values->pdata[p->attr], ';');
737 				} else {
738 					d (printf ("got entity: %s = %s\n", p->ent->str, p->ent_utf8));
739 					g_string_append_len (p->values->pdata[p->attr], p->ent_utf8, len);
740 				}
741 			} else if (isalnum (c) || c=='#') { /* FIXME: right type */
742 				g_string_append_c (p->ent, c);
743 			} else {
744 				/* fallback; broken entity, just output it and see why we ended */
745 				g_string_append (p->values->pdata[p->attr], p->ent->str);
746 				goto do_val;
747 			}
748 			break;
749 		}
750 	}
751 
752 	if (p->eof) {
753 		/* FIXME: what about other truncated states? */
754 		switch (state) {
755 		case CAMEL_HTML_PARSER_DATA:
756 		case CAMEL_HTML_PARSER_COMMENT:
757 			if (in > start) {
758 				ret = state;
759 				*datap = start;
760 				*lenp = in-start-1;
761 			} else {
762 				ret = CAMEL_HTML_PARSER_EOF;
763 				state = CAMEL_HTML_PARSER_EOF;
764 			}
765 			break;
766 		default:
767 			ret = CAMEL_HTML_PARSER_EOF;
768 			state = CAMEL_HTML_PARSER_EOF;
769 		}
770 	} else {
771 		/* we only care about remaining data for this buffer, everything else has its own copy */
772 		switch (state) {
773 		case CAMEL_HTML_PARSER_DATA:
774 		case CAMEL_HTML_PARSER_COMMENT:
775 			if (in > start) {
776 				ret = state;
777 				*datap = start;
778 				*lenp = in-start-1;
779 			} else {
780 				ret = CAMEL_HTML_PARSER_EOD;
781 			}
782 			break;
783 		default:
784 			ret = CAMEL_HTML_PARSER_EOD;
785 		}
786 	}
787 
788 done:
789 	p->start = start;
790 	p->state = state;
791 	p->inptr = in;
792 
793 	return ret;
794 }
795