1 /*
2 * Copyright (C) 1999-2008 Novell, Inc. (www.novell.com)
3 *
4 * This library is free software: you can redistribute it and/or modify it
5 * under the terms of the GNU Lesser General Public License as published by
6 * the Free Software Foundation.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
10 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
11 * for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public License
14 * along with this library. If not, see <http://www.gnu.org/licenses/>.
15 *
16 * Authors: Michael Zucchi <notzed@ximian.com>
17 */
18
19 /* WARNING
20 *
21 * DO NOT USE THIS CODE OUTSIDE OF CAMEL
22 *
23 * IT IS SUBJECT TO CHANGE OR MAY VANISH AT ANY TIME
24 */
25
26 #include <ctype.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30
31 #include "camel-html-parser.h"
32
/* Debug hook: d(x) expands to nothing by default. If it is redefined to emit
 * its argument, dump_tag() further down must be compiled in as well. */
34 #define d(x)
35
36 /* Parser definitions, see below object code for details */
37
38 struct _CamelHTMLParserPrivate {
39 gchar *inbuf,
40 *inptr,
41 *inend,
42 *start;
43 CamelHTMLParserState state;
44 gchar *charset;
45 gint eof;
46 GString *tag;
47 GString *ent;
48 gchar ent_utf8[8];
49 gint attr;
50 GPtrArray *attrs;
51 GPtrArray *values;
52 gint quote;
53 };
54
55 static void tokenize_setup (void);
56 static CamelHTMLParserPrivate *tokenize_init (void);
57 static void tokenize_free (CamelHTMLParserPrivate *p);
58 static gint tokenize_step (CamelHTMLParserPrivate *p, gchar **datap, gint *lenp);
59
G_DEFINE_TYPE(CamelHTMLParser,camel_html_parser,G_TYPE_OBJECT)60 G_DEFINE_TYPE (CamelHTMLParser, camel_html_parser, G_TYPE_OBJECT)
61
62 /* ********************************************************************** */
63
64 static void
65 html_parser_finalize (GObject *object)
66 {
67 CamelHTMLParser *parser = CAMEL_HTML_PARSER (object);
68
69 tokenize_free (parser->priv);
70
71 /* Chain up to parent's finalize() method. */
72 G_OBJECT_CLASS (camel_html_parser_parent_class)->finalize (object);
73 }
74
75 static void
camel_html_parser_class_init(CamelHTMLParserClass * class)76 camel_html_parser_class_init (CamelHTMLParserClass *class)
77 {
78 GObjectClass *object_class;
79
80 object_class = G_OBJECT_CLASS (class);
81 object_class->finalize = html_parser_finalize;
82
83 tokenize_setup ();
84 }
85
86 static void
camel_html_parser_init(CamelHTMLParser * parser)87 camel_html_parser_init (CamelHTMLParser *parser)
88 {
89 parser->priv = tokenize_init ();
90 }
91
92 /**
93 * camel_html_parser_new:
94 *
95 * Create a new CamelHTMLParser object.
96 *
97 * Returns: (transfer full): A new #CamelHTMLParser object
98 **/
99 CamelHTMLParser *
camel_html_parser_new(void)100 camel_html_parser_new (void)
101 {
102 return g_object_new (CAMEL_TYPE_HTML_PARSER, NULL);
103 }
104
/*
 * Points the parser at a new chunk of input.
 *
 * @start/@len describe the chunk; only the pointers are stored, so the
 * memory has to stay valid for as long as the chunk is being stepped through.
 * @last is stored as the end-of-input flag: when non-zero the tokenizer
 * reports CAMEL_HTML_PARSER_EOF instead of CAMEL_HTML_PARSER_EOD once the
 * chunk is used up.
 */
void
camel_html_parser_set_data (CamelHTMLParser *hp,
                            const gchar *start,
                            gint len,
                            gint last)
{
	CamelHTMLParserPrivate *priv = hp->priv;
	gchar *buffer = (gchar *) start;

	priv->inbuf = buffer;
	priv->inptr = buffer;
	priv->inend = buffer + len;
	priv->eof = last;
}
113
camel_html_parser_step(CamelHTMLParser * hp,const gchar ** datap,gint * lenp)114 CamelHTMLParserState camel_html_parser_step (CamelHTMLParser *hp, const gchar **datap, gint *lenp)
115 {
116 return tokenize_step (hp->priv, (gchar **) datap, lenp);
117 }
118
camel_html_parser_left(CamelHTMLParser * hp,gint * lenp)119 const gchar *camel_html_parser_left (CamelHTMLParser *hp, gint *lenp)
120 {
121 CamelHTMLParserPrivate *p = hp->priv;
122
123 if (lenp)
124 *lenp = p->inend - p->inptr;
125
126 return p->inptr;
127 }
128
camel_html_parser_tag(CamelHTMLParser * hp)129 const gchar *camel_html_parser_tag (CamelHTMLParser *hp)
130 {
131 return hp->priv->tag->str;
132 }
133
/*
 * Looks up an attribute of the most recently parsed element.
 *
 * @name is compared ASCII case-insensitively. Returns the attribute's value
 * (owned by the parser, valid until parsing continues), or NULL when the
 * element has no such attribute.
 */
const gchar *
camel_html_parser_attr (CamelHTMLParser *hp,
                        const gchar *name)
{
	CamelHTMLParserPrivate *p = hp->priv;
	gint i;

	g_return_val_if_fail (name != NULL, NULL);

	/* The attrs/values arrays are recycled between tags and never shrink,
	 * so entries at index >= p->attr are leftovers from earlier elements.
	 * Only the first p->attr entries belong to the current tag. */
	for (i = 0; i < p->attr && (guint) i < p->attrs->len; i++) {
		const GString *attr_name = p->attrs->pdata[i];

		if (g_ascii_strcasecmp (attr_name->str, name) == 0)
			return ((GString *) p->values->pdata[i])->str;
	}

	return NULL;
}
147
148 /**
149 * camel_html_parser_attr_list:
150 * @hp: a #CamelHTMLParser
151 * @values: (nullable) (element-type utf8) (inout): an output #GPtrArray with values, or %NULL
152 *
153 * Provides parsed array of values and attributes. Both arrays are
154 * owned by the @hp.
155 *
156 * Returns: (element-type utf8) (transfer none): a #GPtrArray of parsed attributes
157 *
158 **/
159 const GPtrArray *
camel_html_parser_attr_list(CamelHTMLParser * hp,const GPtrArray ** values)160 camel_html_parser_attr_list (CamelHTMLParser *hp,
161 const GPtrArray **values)
162 {
163 if (values)
164 *values = hp->priv->values;
165
166 return hp->priv->attrs;
167 }
168
169 /* this map taken out of libxml */
170 static struct {
171 guint val;
172 const gchar *name;
173 } entity_map[] = {
174 /*
175 * the 4 absolute ones,
176 */
177 { 34, "quot", /* quotation mark = APL quote, U+0022 ISOnum */ },
178 { 38, "amp", /* ampersand, U+0026 ISOnum */ },
179 { 60, "lt", /* less-than sign, U+003C ISOnum */ },
180 { 62, "gt", /* greater-than sign, U+003E ISOnum */ },
181
182 /*
183 * A bunch still in the 128-255 range
184 * Replacing them depend really on the charset used.
185 */
186 { 39, "apos", /* single quote */ },
187 { 160, "nbsp", /* no-break space = non-breaking space, U+00A0 ISOnum */ },
188 { 161, "iexcl",/* inverted exclamation mark, U+00A1 ISOnum */ },
189 { 162, "cent", /* cent sign, U+00A2 ISOnum */ },
190 { 163, "pound",/* pound sign, U+00A3 ISOnum */ },
191 { 164, "curren",/* currency sign, U+00A4 ISOnum */ },
192 { 165, "yen", /* yen sign = yuan sign, U+00A5 ISOnum */ },
193 { 166, "brvbar",/* broken bar = broken vertical bar, U+00A6 ISOnum */ },
194 { 167, "sect", /* section sign, U+00A7 ISOnum */ },
195 { 168, "uml", /* diaeresis = spacing diaeresis, U+00A8 ISOdia */ },
196 { 169, "copy", /* copyright sign, U+00A9 ISOnum */ },
197 { 170, "ordf", /* feminine ordinal indicator, U+00AA ISOnum */ },
198 { 171, "laquo",/* left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum */ },
199 { 172, "not", /* not sign, U+00AC ISOnum */ },
200 { 173, "shy", /* soft hyphen = discretionary hyphen, U+00AD ISOnum */ },
201 { 174, "reg", /* registered sign = registered trade mark sign, U+00AE ISOnum */ },
202 { 175, "macr", /* macron = spacing macron = overline = APL overbar, U+00AF ISOdia */ },
203 { 176, "deg", /* degree sign, U+00B0 ISOnum */ },
204 { 177, "plusmn",/* plus-minus sign = plus-or-minus sign, U+00B1 ISOnum */ },
205 { 178, "sup2", /* superscript two = superscript digit two = squared, U+00B2 ISOnum */ },
206 { 179, "sup3", /* superscript three = superscript digit three = cubed, U+00B3 ISOnum */ },
207 { 180, "acute",/* acute accent = spacing acute, U+00B4 ISOdia */ },
208 { 181, "micro",/* micro sign, U+00B5 ISOnum */ },
209 { 182, "para", /* pilcrow sign = paragraph sign, U+00B6 ISOnum */ },
210 { 183, "middot",/* middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum */ },
211 { 184, "cedil",/* cedilla = spacing cedilla, U+00B8 ISOdia */ },
212 { 185, "sup1", /* superscript one = superscript digit one, U+00B9 ISOnum */ },
213 { 186, "ordm", /* masculine ordinal indicator, U+00BA ISOnum */ },
214 { 187, "raquo",/* right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum */ },
215 { 188, "frac14",/* vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum */ },
216 { 189, "frac12",/* vulgar fraction one half = fraction one half, U+00BD ISOnum */ },
217 { 190, "frac34",/* vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum */ },
218 { 191, "iquest",/* inverted question mark = turned question mark, U+00BF ISOnum */ },
219 { 192, "Agrave",/* latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 */ },
220 { 193, "Aacute",/* latin capital letter A with acute, U+00C1 ISOlat1 */ },
221 { 194, "Acirc",/* latin capital letter A with circumflex, U+00C2 ISOlat1 */ },
222 { 195, "Atilde",/* latin capital letter A with tilde, U+00C3 ISOlat1 */ },
223 { 196, "Auml", /* latin capital letter A with diaeresis, U+00C4 ISOlat1 */ },
224 { 197, "Aring",/* latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 */ },
225 { 198, "AElig",/* latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 */ },
226 { 199, "Ccedil",/* latin capital letter C with cedilla, U+00C7 ISOlat1 */ },
227 { 200, "Egrave",/* latin capital letter E with grave, U+00C8 ISOlat1 */ },
228 { 201, "Eacute",/* latin capital letter E with acute, U+00C9 ISOlat1 */ },
229 { 202, "Ecirc",/* latin capital letter E with circumflex, U+00CA ISOlat1 */ },
230 { 203, "Euml", /* latin capital letter E with diaeresis, U+00CB ISOlat1 */ },
231 { 204, "Igrave",/* latin capital letter I with grave, U+00CC ISOlat1 */ },
232 { 205, "Iacute",/* latin capital letter I with acute, U+00CD ISOlat1 */ },
233 { 206, "Icirc",/* latin capital letter I with circumflex, U+00CE ISOlat1 */ },
234 { 207, "Iuml", /* latin capital letter I with diaeresis, U+00CF ISOlat1 */ },
235 { 208, "ETH", /* latin capital letter ETH, U+00D0 ISOlat1 */ },
236 { 209, "Ntilde",/* latin capital letter N with tilde, U+00D1 ISOlat1 */ },
237 { 210, "Ograve",/* latin capital letter O with grave, U+00D2 ISOlat1 */ },
238 { 211, "Oacute",/* latin capital letter O with acute, U+00D3 ISOlat1 */ },
239 { 212, "Ocirc",/* latin capital letter O with circumflex, U+00D4 ISOlat1 */ },
240 { 213, "Otilde",/* latin capital letter O with tilde, U+00D5 ISOlat1 */ },
241 { 214, "Ouml", /* latin capital letter O with diaeresis, U+00D6 ISOlat1 */ },
242 { 215, "times",/* multiplication sign, U+00D7 ISOnum */ },
243 { 216, "Oslash",/* latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1 */ },
244 { 217, "Ugrave",/* latin capital letter U with grave, U+00D9 ISOlat1 */ },
245 { 218, "Uacute",/* latin capital letter U with acute, U+00DA ISOlat1 */ },
246 { 219, "Ucirc",/* latin capital letter U with circumflex, U+00DB ISOlat1 */ },
247 { 220, "Uuml", /* latin capital letter U with diaeresis, U+00DC ISOlat1 */ },
248 { 221, "Yacute",/* latin capital letter Y with acute, U+00DD ISOlat1 */ },
249 { 222, "THORN",/* latin capital letter THORN, U+00DE ISOlat1 */ },
250 { 223, "szlig",/* latin small letter sharp s = ess-zed, U+00DF ISOlat1 */ },
251 { 224, "agrave",/* latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 */ },
252 { 225, "aacute",/* latin small letter a with acute, U+00E1 ISOlat1 */ },
253 { 226, "acirc",/* latin small letter a with circumflex, U+00E2 ISOlat1 */ },
254 { 227, "atilde",/* latin small letter a with tilde, U+00E3 ISOlat1 */ },
255 { 228, "auml", /* latin small letter a with diaeresis, U+00E4 ISOlat1 */ },
256 { 229, "aring",/* latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 */ },
257 { 230, "aelig",/* latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 */ },
258 { 231, "ccedil",/* latin small letter c with cedilla, U+00E7 ISOlat1 */ },
259 { 232, "egrave",/* latin small letter e with grave, U+00E8 ISOlat1 */ },
260 { 233, "eacute",/* latin small letter e with acute, U+00E9 ISOlat1 */ },
261 { 234, "ecirc",/* latin small letter e with circumflex, U+00EA ISOlat1 */ },
262 { 235, "euml", /* latin small letter e with diaeresis, U+00EB ISOlat1 */ },
263 { 236, "igrave",/* latin small letter i with grave, U+00EC ISOlat1 */ },
264 { 237, "iacute",/* latin small letter i with acute, U+00ED ISOlat1 */ },
265 { 238, "icirc",/* latin small letter i with circumflex, U+00EE ISOlat1 */ },
266 { 239, "iuml", /* latin small letter i with diaeresis, U+00EF ISOlat1 */ },
267 { 240, "eth", /* latin small letter eth, U+00F0 ISOlat1 */ },
268 { 241, "ntilde",/* latin small letter n with tilde, U+00F1 ISOlat1 */ },
269 { 242, "ograve",/* latin small letter o with grave, U+00F2 ISOlat1 */ },
270 { 243, "oacute",/* latin small letter o with acute, U+00F3 ISOlat1 */ },
271 { 244, "ocirc",/* latin small letter o with circumflex, U+00F4 ISOlat1 */ },
272 { 245, "otilde",/* latin small letter o with tilde, U+00F5 ISOlat1 */ },
273 { 246, "ouml", /* latin small letter o with diaeresis, U+00F6 ISOlat1 */ },
274 { 247, "divide",/* division sign, U+00F7 ISOnum */ },
275 { 248, "oslash",/* latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 */ },
276 { 249, "ugrave",/* latin small letter u with grave, U+00F9 ISOlat1 */ },
277 { 250, "uacute",/* latin small letter u with acute, U+00FA ISOlat1 */ },
278 { 251, "ucirc",/* latin small letter u with circumflex, U+00FB ISOlat1 */ },
279 { 252, "uuml", /* latin small letter u with diaeresis, U+00FC ISOlat1 */ },
280 { 253, "yacute",/* latin small letter y with acute, U+00FD ISOlat1 */ },
281 { 254, "thorn",/* latin small letter thorn with, U+00FE ISOlat1 */ },
282 { 255, "yuml", /* latin small letter y with diaeresis, U+00FF ISOlat1 */ },
283
284 /*
285 * Anything below should really be kept as entities references
286 */
287 { 402, "fnof", /* latin small f with hook = function = florin, U+0192 ISOtech */ },
288
289 { 913, "Alpha",/* greek capital letter alpha, U+0391 */ },
290 { 914, "Beta", /* greek capital letter beta, U+0392 */ },
291 { 915, "Gamma",/* greek capital letter gamma, U+0393 ISOgrk3 */ },
292 { 916, "Delta",/* greek capital letter delta, U+0394 ISOgrk3 */ },
293 { 917, "Epsilon",/* greek capital letter epsilon, U+0395 */ },
294 { 918, "Zeta", /* greek capital letter zeta, U+0396 */ },
295 { 919, "Eta", /* greek capital letter eta, U+0397 */ },
296 { 920, "Theta",/* greek capital letter theta, U+0398 ISOgrk3 */ },
297 { 921, "Iota", /* greek capital letter iota, U+0399 */ },
298 { 922, "Kappa",/* greek capital letter kappa, U+039A */ },
299 { 923, "Lambda"/* greek capital letter lambda, U+039B ISOgrk3 */ },
300 { 924, "Mu", /* greek capital letter mu, U+039C */ },
301 { 925, "Nu", /* greek capital letter nu, U+039D */ },
302 { 926, "Xi", /* greek capital letter xi, U+039E ISOgrk3 */ },
303 { 927, "Omicron",/* greek capital letter omicron, U+039F */ },
304 { 928, "Pi", /* greek capital letter pi, U+03A0 ISOgrk3 */ },
305 { 929, "Rho", /* greek capital letter rho, U+03A1 */ },
306 { 931, "Sigma",/* greek capital letter sigma, U+03A3 ISOgrk3 */ },
307 { 932, "Tau", /* greek capital letter tau, U+03A4 */ },
308 { 933, "Upsilon",/* greek capital letter upsilon, U+03A5 ISOgrk3 */ },
309 { 934, "Phi", /* greek capital letter phi, U+03A6 ISOgrk3 */ },
310 { 935, "Chi", /* greek capital letter chi, U+03A7 */ },
311 { 936, "Psi", /* greek capital letter psi, U+03A8 ISOgrk3 */ },
312 { 937, "Omega",/* greek capital letter omega, U+03A9 ISOgrk3 */ },
313
314 { 945, "alpha",/* greek small letter alpha, U+03B1 ISOgrk3 */ },
315 { 946, "beta", /* greek small letter beta, U+03B2 ISOgrk3 */ },
316 { 947, "gamma",/* greek small letter gamma, U+03B3 ISOgrk3 */ },
317 { 948, "delta",/* greek small letter delta, U+03B4 ISOgrk3 */ },
318 { 949, "epsilon",/* greek small letter epsilon, U+03B5 ISOgrk3 */ },
319 { 950, "zeta", /* greek small letter zeta, U+03B6 ISOgrk3 */ },
320 { 951, "eta", /* greek small letter eta, U+03B7 ISOgrk3 */ },
321 { 952, "theta",/* greek small letter theta, U+03B8 ISOgrk3 */ },
322 { 953, "iota", /* greek small letter iota, U+03B9 ISOgrk3 */ },
323 { 954, "kappa",/* greek small letter kappa, U+03BA ISOgrk3 */ },
324 { 955, "lambda",/* greek small letter lambda, U+03BB ISOgrk3 */ },
325 { 956, "mu", /* greek small letter mu, U+03BC ISOgrk3 */ },
326 { 957, "nu", /* greek small letter nu, U+03BD ISOgrk3 */ },
327 { 958, "xi", /* greek small letter xi, U+03BE ISOgrk3 */ },
328 { 959, "omicron",/* greek small letter omicron, U+03BF NEW */ },
329 { 960, "pi", /* greek small letter pi, U+03C0 ISOgrk3 */ },
330 { 961, "rho", /* greek small letter rho, U+03C1 ISOgrk3 */ },
331 { 962, "sigmaf",/* greek small letter final sigma, U+03C2 ISOgrk3 */ },
332 { 963, "sigma",/* greek small letter sigma, U+03C3 ISOgrk3 */ },
333 { 964, "tau", /* greek small letter tau, U+03C4 ISOgrk3 */ },
334 { 965, "upsilon",/* greek small letter upsilon, U+03C5 ISOgrk3 */ },
335 { 966, "phi", /* greek small letter phi, U+03C6 ISOgrk3 */ },
336 { 967, "chi", /* greek small letter chi, U+03C7 ISOgrk3 */ },
337 { 968, "psi", /* greek small letter psi, U+03C8 ISOgrk3 */ },
338 { 969, "omega",/* greek small letter omega, U+03C9 ISOgrk3 */ },
339 { 977, "thetasym",/* greek small letter theta symbol, U+03D1 NEW */ },
340 { 978, "upsih",/* greek upsilon with hook symbol, U+03D2 NEW */ },
341 { 982, "piv", /* greek pi symbol, U+03D6 ISOgrk3 */ },
342
343 { 8226, "bull", /* bullet = black small circle, U+2022 ISOpub */ },
344 { 8230, "hellip",/* horizontal ellipsis = three dot leader, U+2026 ISOpub */ },
345 { 8242, "prime",/* prime = minutes = feet, U+2032 ISOtech */ },
346 { 8243, "Prime",/* double prime = seconds = inches, U+2033 ISOtech */ },
347 { 8254, "oline",/* overline = spacing overscore, U+203E NEW */ },
348 { 8260, "frasl",/* fraction slash, U+2044 NEW */ },
349
350 { 8472, "weierp",/* script capital P = power set = Weierstrass p, U+2118 ISOamso */ },
351 { 8465, "image",/* blackletter capital I = imaginary part, U+2111 ISOamso */ },
352 { 8476, "real", /* blackletter capital R = real part symbol, U+211C ISOamso */ },
353 { 8482, "trade",/* trade mark sign, U+2122 ISOnum */ },
354 { 8501, "alefsym",/* alef symbol = first transfinite cardinal, U+2135 NEW */ },
355 { 8592, "larr", /* leftwards arrow, U+2190 ISOnum */ },
356 { 8593, "uarr", /* upwards arrow, U+2191 ISOnum */ },
357 { 8594, "rarr", /* rightwards arrow, U+2192 ISOnum */ },
358 { 8595, "darr", /* downwards arrow, U+2193 ISOnum */ },
359 { 8596, "harr", /* left right arrow, U+2194 ISOamsa */ },
360 { 8629, "crarr",/* downwards arrow with corner leftwards = carriage return, U+21B5 NEW */ },
361 { 8656, "lArr", /* leftwards double arrow, U+21D0 ISOtech */ },
362 { 8657, "uArr", /* upwards double arrow, U+21D1 ISOamsa */ },
363 { 8658, "rArr", /* rightwards double arrow, U+21D2 ISOtech */ },
364 { 8659, "dArr", /* downwards double arrow, U+21D3 ISOamsa */ },
365 { 8660, "hArr", /* left right double arrow, U+21D4 ISOamsa */ },
366
367 { 8704, "forall",/* for all, U+2200 ISOtech */ },
368 { 8706, "part", /* partial differential, U+2202 ISOtech */ },
369 { 8707, "exist",/* there exists, U+2203 ISOtech */ },
370 { 8709, "empty",/* empty set = null set = diameter, U+2205 ISOamso */ },
371 { 8711, "nabla",/* nabla = backward difference, U+2207 ISOtech */ },
372 { 8712, "isin", /* element of, U+2208 ISOtech */ },
373 { 8713, "notin",/* not an element of, U+2209 ISOtech */ },
374 { 8715, "ni", /* contains as member, U+220B ISOtech */ },
375 { 8719, "prod", /* n-ary product = product sign, U+220F ISOamsb */ },
376 { 8721, "sum", /* n-ary sumation, U+2211 ISOamsb */ },
377 { 8722, "minus",/* minus sign, U+2212 ISOtech */ },
378 { 8727, "lowast",/* asterisk operator, U+2217 ISOtech */ },
379 { 8730, "radic",/* square root = radical sign, U+221A ISOtech */ },
380 { 8733, "prop", /* proportional to, U+221D ISOtech */ },
381 { 8734, "infin",/* infinity, U+221E ISOtech */ },
382 { 8736, "ang", /* angle, U+2220 ISOamso */ },
383 { 8743, "and", /* logical and = wedge, U+2227 ISOtech */ },
384 { 8744, "or", /* logical or = vee, U+2228 ISOtech */ },
385 { 8745, "cap", /* intersection = cap, U+2229 ISOtech */ },
386 { 8746, "cup", /* union = cup, U+222A ISOtech */ },
387 { 8747, "int", /* integral, U+222B ISOtech */ },
388 { 8756, "there4",/* therefore, U+2234 ISOtech */ },
389 { 8764, "sim", /* tilde operator = varies with = similar to, U+223C ISOtech */ },
390 { 8773, "cong", /* approximately equal to, U+2245 ISOtech */ },
391 { 8776, "asymp",/* almost equal to = asymptotic to, U+2248 ISOamsr */ },
392 { 8800, "ne", /* not equal to, U+2260 ISOtech */ },
393 { 8801, "equiv",/* identical to, U+2261 ISOtech */ },
394 { 8804, "le", /* less-than or equal to, U+2264 ISOtech */ },
395 { 8805, "ge", /* greater-than or equal to, U+2265 ISOtech */ },
396 { 8834, "sub", /* subset of, U+2282 ISOtech */ },
397 { 8835, "sup", /* superset of, U+2283 ISOtech */ },
398 { 8836, "nsub", /* not a subset of, U+2284 ISOamsn */ },
399 { 8838, "sube", /* subset of or equal to, U+2286 ISOtech */ },
400 { 8839, "supe", /* superset of or equal to, U+2287 ISOtech */ },
401 { 8853, "oplus",/* circled plus = direct sum, U+2295 ISOamsb */ },
402 { 8855, "otimes",/* circled times = vector product, U+2297 ISOamsb */ },
403 { 8869, "perp", /* up tack = orthogonal to = perpendicular, U+22A5 ISOtech */ },
404 { 8901, "sdot", /* dot operator, U+22C5 ISOamsb */ },
405 { 8968, "lceil",/* left ceiling = apl upstile, U+2308 ISOamsc */ },
406 { 8969, "rceil",/* right ceiling, U+2309 ISOamsc */ },
407 { 8970, "lfloor",/* left floor = apl downstile, U+230A ISOamsc */ },
408 { 8971, "rfloor",/* right floor, U+230B ISOamsc */ },
409 { 9001, "lang", /* left-pointing angle bracket = bra, U+2329 ISOtech */ },
410 { 9002, "rang", /* right-pointing angle bracket = ket, U+232A ISOtech */ },
411 { 9674, "loz", /* lozenge, U+25CA ISOpub */ },
412
413 { 9824, "spades",/* black spade suit, U+2660 ISOpub */ },
414 { 9827, "clubs",/* black club suit = shamrock, U+2663 ISOpub */ },
415 { 9829, "hearts",/* black heart suit = valentine, U+2665 ISOpub */ },
416 { 9830, "diams",/* black diamond suit, U+2666 ISOpub */ },
417
418 { 338, "OElig",/* latin capital ligature OE, U+0152 ISOlat2 */ },
419 { 339, "oelig",/* latin small ligature oe, U+0153 ISOlat2 */ },
420 { 352, "Scaron",/* latin capital letter S with caron, U+0160 ISOlat2 */ },
421 { 353, "scaron",/* latin small letter s with caron, U+0161 ISOlat2 */ },
422 { 376, "Yuml", /* latin capital letter Y with diaeresis, U+0178 ISOlat2 */ },
423 { 710, "circ", /* modifier letter circumflex accent, U+02C6 ISOpub */ },
424 { 732, "tilde",/* small tilde, U+02DC ISOdia */ },
425
426 { 8194, "ensp", /* en space, U+2002 ISOpub */ },
427 { 8195, "emsp", /* em space, U+2003 ISOpub */ },
428 { 8201, "thinsp",/* thin space, U+2009 ISOpub */ },
429 { 8204, "zwnj", /* zero width non-joiner, U+200C NEW RFC 2070 */ },
430 { 8205, "zwj", /* zero width joiner, U+200D NEW RFC 2070 */ },
431 { 8206, "lrm", /* left-to-right mark, U+200E NEW RFC 2070 */ },
432 { 8207, "rlm", /* right-to-left mark, U+200F NEW RFC 2070 */ },
433 { 8211, "ndash",/* en dash, U+2013 ISOpub */ },
434 { 8212, "mdash",/* em dash, U+2014 ISOpub */ },
435 { 8216, "lsquo",/* left single quotation mark, U+2018 ISOnum */ },
436 { 8217, "rsquo",/* right single quotation mark, U+2019 ISOnum */ },
437 { 8218, "sbquo",/* single low-9 quotation mark, U+201A NEW */ },
438 { 8220, "ldquo",/* left double quotation mark, U+201C ISOnum */ },
439 { 8221, "rdquo",/* right double quotation mark, U+201D ISOnum */ },
440 { 8222, "bdquo",/* double low-9 quotation mark, U+201E NEW */ },
441 { 8224, "dagger",/* dagger, U+2020 ISOpub */ },
442 { 8225, "Dagger",/* double dagger, U+2021 ISOpub */ },
443 { 8240, "permil",/* per mille sign, U+2030 ISOtech */ },
444 { 8249, "lsaquo",/* single left-pointing angle quotation mark, U+2039 ISO proposed */ },
445 { 8250, "rsaquo",/* single right-pointing angle quotation mark, U+203A ISO proposed */ },
446 { 8364, "euro", /* euro sign, U+20AC NEW */ }
447 };
448
449 static GHashTable *entities;
450
451 /* this cannot be called in a thread context */
tokenize_setup(void)452 static void tokenize_setup (void)
453 {
454 gint i;
455
456 if (entities == NULL) {
457 entities = g_hash_table_new (g_str_hash, g_str_equal);
458 for (i = 0; i < G_N_ELEMENTS (entity_map); i++) {
459 g_hash_table_insert (entities, (gchar *) entity_map[i].name, GUINT_TO_POINTER (entity_map[i].val));
460 }
461 }
462 }
463
tokenize_init(void)464 static CamelHTMLParserPrivate *tokenize_init (void)
465 {
466 CamelHTMLParserPrivate *p;
467
468 p = g_malloc (sizeof (*p));
469 p->state = CAMEL_HTML_PARSER_DATA;
470
471 p->attr = 0;
472 p->attrs = g_ptr_array_new ();
473 p->values = g_ptr_array_new ();
474 p->tag = g_string_new ("");
475 p->ent = g_string_new ("");
476 p->charset = NULL;
477
478 if (entities == NULL)
479 tokenize_setup ();
480
481 return p;
482 }
483
/*
 * Releases everything owned by the tokenizer state @p: the tag and entity
 * buffers, the charset string, every attribute name/value string, the two
 * arrays that hold them, and finally @p itself. A NULL @p is ignored.
 */
static void
tokenize_free (CamelHTMLParserPrivate *p)
{
	guint i;

	if (p == NULL)
		return;

	g_string_free (p->tag, TRUE);
	g_string_free (p->ent, TRUE);
	g_free (p->charset);

	for (i = 0; i < p->attrs->len; i++)
		g_string_free (p->attrs->pdata[i], TRUE);
	/* the array container itself is heap-allocated too */
	g_ptr_array_free (p->attrs, TRUE);

	for (i = 0; i < p->values->len; i++)
		g_string_free (p->values->pdata[i], TRUE);
	g_ptr_array_free (p->values, TRUE);

	g_free (p);
}
500
convert_entity(const gchar * e,gchar * ent)501 static gint convert_entity (const gchar *e, gchar *ent)
502 {
503 guint val;
504
505 if (e[0] == '#')
506 return g_unichar_to_utf8 (atoi (e + 1), ent);
507
508 val = GPOINTER_TO_UINT (g_hash_table_lookup (entities, e));
509 if (ent)
510 return g_unichar_to_utf8 (val, ent);
511 else
512 return 0;
513 }
514
#if 0
/* Debug helper: prints the current tag name followed by each parsed
 * attribute and its value. Only needed when d(x) is enabled. */
static void
dump_tag (CamelHTMLParserPrivate *p)
{
	gint idx = 0;

	printf ("got tag: %s\n", p->tag->str);
	printf ("%d attributes:\n", p->attr);

	while (idx < p->attr) {
		GString *name = p->attrs->pdata[idx];
		GString *value = p->values->pdata[idx];

		printf (" %s = '%s'\n", name->str, value->str);
		idx++;
	}
}
#endif
527
tokenize_step(CamelHTMLParserPrivate * p,gchar ** datap,gint * lenp)528 static gint tokenize_step (CamelHTMLParserPrivate *p, gchar **datap, gint *lenp)
529 {
530 gchar *in = p->inptr;
531 gchar *inend = p->inend;
532 gchar c;
533 gint state = p->state, ret, len;
534 gchar *start = p->inptr;
535
536 d (printf ("Tokenise step\n"));
537
538 while (in < inend) {
539 c = *in++;
540 switch (state) {
541 case CAMEL_HTML_PARSER_DATA:
542 if (c == '<') {
543 ret = state;
544 state = CAMEL_HTML_PARSER_TAG;
545 p->attr = 0;
546 g_string_truncate (p->tag, 0);
547 d (printf ("got data '%.*s'\n", in - start - 1, start));
548 *datap = start;
549 *lenp = in-start-1;
550 goto done;
551 } else if (c == '&') {
552 ret = state;
553 state = CAMEL_HTML_PARSER_ENT;
554 g_string_truncate (p->ent, 0);
555 g_string_append_c (p->ent, c);
556 d (printf ("got data '%.*s'\n", in - start - 1, start));
557 *datap = start;
558 *lenp = in-start-1;
559 goto done;
560 }
561 break;
562 case CAMEL_HTML_PARSER_ENT:
563 if (c == ';') {
564 len = convert_entity (p->ent->str + 1, p->ent_utf8);
565 if (len == 0) {
566 /* handle broken entity */
567 g_string_append_c (p->ent, c);
568 ret = state = CAMEL_HTML_PARSER_DATA;
569 *datap = p->ent->str;
570 *lenp = p->ent->len;
571 goto done;
572 } else {
573 d (printf ("got entity: %s = %s\n", p->ent->str, p->ent_utf8));
574 ret = state;
575 state = CAMEL_HTML_PARSER_DATA;
576 *datap = p->ent_utf8;
577 *lenp = len;
578 goto done;
579 }
580 } else if (isalnum (c) || c=='#') { /* FIXME: right type */
581 g_string_append_c (p->ent, c);
582 } else {
583 /* handle broken entity */
584 g_string_append_c (p->ent, c);
585 ret = state = CAMEL_HTML_PARSER_DATA;
586 *datap = p->ent->str;
587 *lenp = p->ent->len;
588 goto done;
589 }
590 break;
591 case CAMEL_HTML_PARSER_TAG:
592 if (c == '!') {
593 state = CAMEL_HTML_PARSER_COMMENT0;
594 g_string_append_c (p->tag, c);
595 } else if (c == '>') {
596 d (dump_tag (p));
597 ret = CAMEL_HTML_PARSER_ELEMENT;
598 state = CAMEL_HTML_PARSER_DATA;
599 goto done;
600 } else if (c == ' ' || c == '\n' || c == '\t') {
601 state = CAMEL_HTML_PARSER_ATTR0;
602 } else {
603 g_string_append_c (p->tag, c);
604 }
605 break;
606 /* check for <!-- */
607 case CAMEL_HTML_PARSER_COMMENT0:
608 if (c == '-') {
609 g_string_append_c (p->tag, c);
610 if (p->tag->len == 3) {
611 g_string_truncate (p->tag, 0);
612 state = CAMEL_HTML_PARSER_COMMENT;
613 }
614 } else {
615 /* got something else, probbly dtd entity */
616 state = CAMEL_HTML_PARSER_DTDENT;
617 }
618 break;
619 case CAMEL_HTML_PARSER_DTDENT:
620 if (c == '>') {
621 ret = CAMEL_HTML_PARSER_DTDENT;
622 state = CAMEL_HTML_PARSER_DATA;
623 *datap = start;
624 *lenp = in-start-1;
625 goto done;
626 }
627 break;
628 case CAMEL_HTML_PARSER_COMMENT:
629 if (c == '>' && p->tag->len == 2) {
630 ret = CAMEL_HTML_PARSER_COMMENT;
631 state = CAMEL_HTML_PARSER_DATA;
632 *datap = start;
633 *lenp = in-start-1;
634 goto done;
635 } else if (c == '-') {
636 /* we dont care if we get 'n' --'s before the > */
637 if (p->tag->len < 2)
638 g_string_append_c (p->tag, c);
639 } else {
640 g_string_truncate (p->tag, 0);
641 }
642 break;
643 case CAMEL_HTML_PARSER_ATTR0: /* pre-attribute whitespace */
644 if (c == '>') {
645 d (dump_tag (p));
646 ret = CAMEL_HTML_PARSER_ELEMENT;
647 state = CAMEL_HTML_PARSER_DATA;
648 goto done;
649 } else if (c == ' ' || c == '\n' || c == '\t') {
650 } else {
651 if (p->attrs->len <= p->attr) {
652 g_ptr_array_add (p->attrs, g_string_new (""));
653 g_ptr_array_add (p->values, g_string_new (""));
654 } else {
655 g_string_truncate (p->attrs->pdata[p->attr], 0);
656 g_string_truncate (p->values->pdata[p->attr], 0);
657 }
658 g_string_append_c (p->attrs->pdata[p->attr], c);
659 state = CAMEL_HTML_PARSER_ATTR;
660 }
661 break;
662 case CAMEL_HTML_PARSER_ATTR:
663 if (c == '>') {
664 d (dump_tag (p));
665 ret = CAMEL_HTML_PARSER_ELEMENT;
666 state = CAMEL_HTML_PARSER_DATA;
667 goto done;
668 } else if (c == '=') {
669 state = CAMEL_HTML_PARSER_VAL0;
670 } else if (c == ' ' || c == '\n' || c == '\t') {
671 state = CAMEL_HTML_PARSER_ATTR0;
672 p->attr++;
673 } else {
674 g_string_append_c (p->attrs->pdata[p->attr], c);
675 }
676 break;
677 case CAMEL_HTML_PARSER_VAL0:
678 if (c == '>') {
679 d (printf ("value truncated\n"));
680 d (dump_tag (p));
681 ret = CAMEL_HTML_PARSER_ELEMENT;
682 state = CAMEL_HTML_PARSER_DATA;
683 goto done;
684 } else if (c == '\'' || c == '\"') {
685 p->quote = c;
686 state = CAMEL_HTML_PARSER_VAL;
687 } else if (c == ' ' || c == '\n' || c == '\t') {
688 } else {
689 g_string_append_c (p->values->pdata[p->attr], c);
690 p->quote = 0;
691 state = CAMEL_HTML_PARSER_VAL;
692 }
693 break;
694 case CAMEL_HTML_PARSER_VAL:
695 do_val:
696 if (p->quote) {
697 if (c == '>') {
698 d (printf ("value truncated\n"));
699 d (dump_tag (p));
700 ret = CAMEL_HTML_PARSER_ELEMENT;
701 state = CAMEL_HTML_PARSER_DATA;
702 p->attr++;
703 goto done;
704 } else if (c == p->quote) {
705 state = CAMEL_HTML_PARSER_ATTR0;
706 p->attr++;
707 } else if (c == '&') {
708 state = CAMEL_HTML_PARSER_VAL_ENT;
709 g_string_truncate (p->ent, 0);
710 } else {
711 g_string_append_c (p->values->pdata[p->attr], c);
712 }
713 } else if (c == '>') {
714 d (dump_tag (p));
715 ret = CAMEL_HTML_PARSER_ELEMENT;
716 state = CAMEL_HTML_PARSER_DATA;
717 p->attr++;
718 goto done;
719 } else if (c == ' ' || c == '\n' || c == '\t') {
720 state = CAMEL_HTML_PARSER_ATTR0;
721 p->attr++;
722 } else if (c == '&') {
723 state = CAMEL_HTML_PARSER_VAL_ENT;
724 g_string_truncate (p->ent, 0);
725 } else {
726 g_string_append_c (p->values->pdata[p->attr], c);
727 }
728 break;
729 case CAMEL_HTML_PARSER_VAL_ENT:
730 if (c == ';') {
731 state = CAMEL_HTML_PARSER_VAL;
732 len = convert_entity (p->ent->str + 1, p->ent_utf8);
733 if (len == 0) {
734 /* fallback; broken entity, just output it and see why we ended */
735 g_string_append (p->values->pdata[p->attr], p->ent->str);
736 g_string_append_c (p->values->pdata[p->attr], ';');
737 } else {
738 d (printf ("got entity: %s = %s\n", p->ent->str, p->ent_utf8));
739 g_string_append_len (p->values->pdata[p->attr], p->ent_utf8, len);
740 }
741 } else if (isalnum (c) || c=='#') { /* FIXME: right type */
742 g_string_append_c (p->ent, c);
743 } else {
744 /* fallback; broken entity, just output it and see why we ended */
745 g_string_append (p->values->pdata[p->attr], p->ent->str);
746 goto do_val;
747 }
748 break;
749 }
750 }
751
752 if (p->eof) {
753 /* FIXME: what about other truncated states? */
754 switch (state) {
755 case CAMEL_HTML_PARSER_DATA:
756 case CAMEL_HTML_PARSER_COMMENT:
757 if (in > start) {
758 ret = state;
759 *datap = start;
760 *lenp = in-start-1;
761 } else {
762 ret = CAMEL_HTML_PARSER_EOF;
763 state = CAMEL_HTML_PARSER_EOF;
764 }
765 break;
766 default:
767 ret = CAMEL_HTML_PARSER_EOF;
768 state = CAMEL_HTML_PARSER_EOF;
769 }
770 } else {
771 /* we only care about remaining data for this buffer, everything else has its own copy */
772 switch (state) {
773 case CAMEL_HTML_PARSER_DATA:
774 case CAMEL_HTML_PARSER_COMMENT:
775 if (in > start) {
776 ret = state;
777 *datap = start;
778 *lenp = in-start-1;
779 } else {
780 ret = CAMEL_HTML_PARSER_EOD;
781 }
782 break;
783 default:
784 ret = CAMEL_HTML_PARSER_EOD;
785 }
786 }
787
788 done:
789 p->start = start;
790 p->state = state;
791 p->inptr = in;
792
793 return ret;
794 }
795