1 /*
2 * xml_tok.c -- pull based xml tokenizer
3 *
4 * Copyright (C) 2002 Øyvind Kolås <pippin@users.sourceforge.net>
5 *
6 * This program is free software; you can redistribute it and/or modify it under
7 * the terms of the GNU General Public License as published by the Free
8 * Software Foundation; either version 2, or (at your option) any later
9 * version.
10 *
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 59
18 * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 */
20
21 #include <string.h>
22 #include <stdlib.h>
23 #include <stdio.h>
24 #include <ctype.h>
25 #include "xml_tok.h"
26
/*
 * States of the tokenizer's state machine.  The numeric value of a state is
 * used directly as the row index into state_table, and s_null (0) doubles as
 * the "empty slot" marker inside a row, so the order below must not change.
 */
enum {
  s_null = 0,             /* unused slot / row terminator */

  s_start,                /* between tokens */

  /* opening tag and its name */
  s_tag,
  s_tagnamestart,
  s_tagname,
  s_tagnamedone,
  s_intag,

  /* attribute names */
  s_attstart,
  s_attname,
  s_attdone,
  s_att,
  s_atteq,

  /* attribute values: quoted with ", quoted with ', or unquoted */
  s_eqquot,
  s_eqvalstart,           /* no transitions are registered for this state */
  s_eqapos,
  s_eqaposval,
  s_eqaposvaldone,
  s_eqval,
  s_eqvaldone,
  s_eqquotval,
  s_eqquotvaldone,

  /* end of an opening tag, and "/>" */
  s_tagend,
  s_empty,
  s_emptyend,

  /* character data */
  s_whitespace,
  s_whitespacedone,
  s_entitystart,
  s_entity,
  s_entitydone,
  s_word,
  s_worddone,

  /* closing tag */
  s_tagclose,
  s_tagclosenamestart,
  s_tagclosename,
  s_tagclosedone,

  /* "<!" constructs: comments and DTD */
  s_tagexcl,
  s_commentdash1,
  s_commentdash2,
  s_incomment,
  s_commentenddash1,
  s_commentenddash2,
  s_commentdone,
  s_dtd,

  /* "<?" ... "?>" */
  s_prolog,
  s_prologq,
  s_prologdone,

  s_eof,                  /* no transitions are registered for this state */
  s_error                 /* entered after "<!-" not followed by "-" */
};
77
78
/* Set of characters the tokenizer treats as whitespace
   (space, newline, carriage return, tab). */
char *c_ws = " \n\r\t";
80
/*
 * Bit flags describing what a transition does with the current character.
 * They are combined with '+' in the transition table and tested with '&'.
 */
enum {
  c_nil   = 0,            /* leave the character held for the next state */
  c_eat   = 1 << 0,       /* consume it: the next state gets a fresh character */
  c_store = 1 << 1        /* append it to the output buffer */
};
86
/*
 * One slot of the transition table.
 *
 * A slot is either a transition (matched against the current character via
 * `chars` and/or the inclusive range r_start..r_end) or a "return" slot
 * (return_type != 0) that makes the tokenizer hand the collected buffer to
 * the caller.  A slot whose `state` is 0 is empty and ends the row.
 */
typedef struct {
  int            state;         /* owning state; 0 marks an unused slot */
  char          *chars;         /* explicit character set to match, or NULL */
  unsigned char  r_start;       /* first character of the matching range */
  unsigned char  r_end;         /* last character of the matching range;
                                   the range is ignored when both ends are 0 */
  int            next_state;    /* state entered when this slot applies */
  int            resetbuf;      /* not read or written by the code in this file */
  int            charhandling;  /* combination of c_eat / c_store */
  int            return_type;   /* if set return current buf, with type set to the type */
} state_entry;
97
98 #define max_entries 20
99
100 static state_entry state_table[s_error][max_entries];
101
a(int state,char * chars,unsigned char r_start,unsigned char r_end,int charhandling,int next_state)102 static void a (int state, char *chars, unsigned char r_start,
103 unsigned char r_end, int charhandling, int next_state)
104 {
105 int no = 0;
106
107 while (state_table[state][no].state != s_null)
108 no++;
109 state_table[state][no].state = state;
110 state_table[state][no].r_start = r_start;
111 if (chars)
112 state_table[state][no].chars = strdup (chars);
113 state_table[state][no].r_end = r_end;
114 state_table[state][no].charhandling = charhandling;
115 state_table[state][no].next_state = next_state;
116 }
117
r(int state,int return_type,int next_state)118 static void r (int state, int return_type, int next_state)
119 {
120 state_table[state][0].state = state;
121 state_table[state][0].return_type = return_type;
122 state_table[state][0].next_state = next_state;
123 }
124
125 /* *INDENT-OFF* */
126
init_statetable(void)127 void init_statetable(void){
128 static int inited=0;
129 if(inited)return;
130 inited=1;
131 memset(state_table,0,sizeof(state_table));
132 a(s_start, "<", 0,0, c_eat, s_tag);
133 a(s_start, c_ws, 0,0, c_eat+c_store, s_whitespace);
134 a(s_start, "&", 0,0, c_eat, s_entitystart);
135 a(s_start, NULL, 0,255, c_eat+c_store, s_word);
136
137 a(s_tag, c_ws, 0,0, c_eat, s_tag);
138 a(s_tag, "/", 0,0, c_eat, s_tagclose);
139 a(s_tag, "!", 0,0, c_eat, s_tagexcl);
140 a(s_tag, "?", 0,0, c_eat, s_prolog);
141 a(s_tag, NULL, 0,255, c_eat+c_store, s_tagnamestart);
142
143 a(s_tagclose, NULL, 0,255, c_eat+c_store, s_tagclosenamestart);
144 a(s_tagclosenamestart, ">", 0,0, c_eat, s_tagclosedone);
145 a(s_tagclosenamestart, NULL, 0,255, c_eat+c_store, s_tagclosename);
146 a(s_tagclosename, ">", 0,0, c_eat, s_tagclosedone);
147 a(s_tagclosename, NULL, 0,255, c_eat+c_store, s_tagclosename);
148 r(s_tagclosedone, t_closetag, s_start);
149
150 a(s_whitespace, c_ws, 0,0, c_eat+c_store, s_whitespace);
151 a(s_whitespace, NULL, 0,255, c_nil, s_whitespacedone);
152 r(s_whitespacedone, t_whitespace, s_start);
153
154 a(s_entitystart,";", 0,0, c_eat, s_entitydone);
155 a(s_entitystart,NULL, 0,255, c_eat+c_store, s_entity);
156 a(s_entity, ";", 0,0, c_eat, s_entitydone);
157 a(s_entity,NULL, 0,255, c_eat+c_store, s_entity);
158 r(s_entitydone, t_entity, s_start);
159
160 a(s_word, c_ws, 0,0, c_nil, s_worddone);
161 a(s_word, "<&", 0,0, c_nil, s_worddone);
162 a(s_word, NULL, 0,255, c_eat+c_store, s_word);
163 r(s_worddone, t_word, s_start);
164
165 a(s_tagnamestart,c_ws, 0,0, c_nil, s_tagnamedone);
166 a(s_tagnamestart, "/>", 0,0, c_nil, s_tagnamedone);
167 a(s_tagnamestart,NULL, 0,255, c_eat+c_store, s_tagname);
168 a(s_tagname, c_ws, 0,0, c_nil, s_tagnamedone);
169 a(s_tagname, "/>", 0,0, c_nil, s_tagnamedone);
170 a(s_tagname, NULL, 0,255, c_eat+c_store, s_tagname);
171 r(s_tagnamedone, t_tag, s_intag);
172
173 a(s_intag, c_ws, 0,0, c_eat, s_intag);
174 a(s_intag, ">", 0,0, c_eat, s_tagend);
175 a(s_intag, "/", 0,0, c_eat, s_empty);
176 a(s_intag, NULL, 0,255, c_eat+c_store, s_attstart);
177
178 a(s_attstart, c_ws, 0,0, c_eat, s_attdone);
179 a(s_attstart, "=/>", 0,0, c_nil, s_attdone);
180 a(s_attstart, NULL, 0,255, c_eat+c_store, s_attname);
181 a(s_attname, "=/>", 0,0, c_nil, s_attdone);
182 a(s_attname, c_ws, 0,0, c_eat, s_attdone);
183 a(s_attname, NULL, 0,255, c_eat+c_store, s_attname);
184 r(s_attdone, t_att, s_att);
185 a(s_att, c_ws, 0,0, c_eat, s_att);
186 a(s_att, "=", 0,0, c_eat, s_atteq);
187 a(s_att, NULL, 0,255, c_eat, s_intag);
188 a(s_atteq, "'", 0,0, c_eat, s_eqapos);
189 a(s_atteq, "\"", 0,0, c_eat, s_eqquot);
190 a(s_atteq, c_ws, 0,0, c_eat, s_atteq);
191 a(s_atteq, NULL, 0,255, c_nil, s_eqval);
192
193 a(s_eqapos, "'", 0,0, c_eat, s_eqaposvaldone);
194 a(s_eqapos, NULL, 0,255, c_eat+c_store, s_eqaposval);
195 a(s_eqaposval, "'", 0,0, c_eat, s_eqaposvaldone);
196 a(s_eqaposval, NULL, 0,255, c_eat+c_store, s_eqaposval);
197 r(s_eqaposvaldone, t_val, s_intag);
198
199 a(s_eqquot, "\"", 0,0, c_eat, s_eqquotvaldone);
200 a(s_eqquot, NULL, 0,255, c_eat+c_store, s_eqquotval);
201 a(s_eqquotval, "\"", 0,0, c_eat, s_eqquotvaldone);
202 a(s_eqquotval, NULL, 0,255, c_eat+c_store, s_eqquotval);
203 r(s_eqquotvaldone, t_val, s_intag);
204
205 a(s_eqval, c_ws, 0,0, c_nil, s_eqvaldone);
206 a(s_eqval, "/>", 0,0, c_nil, s_eqvaldone);
207 a(s_eqval, NULL, 0,255, c_eat+c_store, s_eqval);
208
209 r(s_eqvaldone, t_val, s_intag);
210
211 r(s_tagend, t_endtag, s_start);
212
213 a(s_empty, ">",0,0, c_eat, s_emptyend);
214 a(s_empty, NULL,0,255, c_eat, s_empty);
215 r(s_emptyend, t_closeemptytag, s_start);
216
217 a(s_prolog, "?",0,0, c_eat, s_prologq);
218 a(s_prolog, NULL,0,255, c_eat+c_store, s_prolog);
219
220 a(s_prologq, ">",0,0, c_eat, s_prologdone);
221 a(s_prologq, NULL,0,255, c_eat+c_store, s_prolog);
222 r(s_prologdone, t_prolog, s_start);
223
224 a(s_tagexcl, "-",0,0, c_eat, s_commentdash1);
225 a(s_tagexcl, "D",0,0, c_nil, s_dtd);
226 a(s_tagexcl, NULL,0,255, c_eat, s_start);
227
228 a(s_commentdash1, "-",0,0, c_eat, s_commentdash2);
229 a(s_commentdash1, NULL,0,255, c_eat, s_error);
230
231 a(s_commentdash2, "-",0,0, c_eat, s_commentenddash1);
232 a(s_commentdash2, NULL,0,255, c_eat+c_store, s_incomment);
233
234 a(s_incomment , "-",0,0, c_eat, s_commentenddash1);
235 a(s_incomment , NULL,0,255, c_eat+c_store, s_incomment);
236
237 a(s_commentenddash1, "-",0,0, c_eat, s_commentenddash2);
238 a(s_commentenddash1, NULL,0,255, c_eat+c_store, s_incomment);
239
240 a(s_commentenddash2, ">",0,0, c_eat, s_commentdone);
241 a(s_commentenddash2, NULL,0,255, c_eat+c_store, s_incomment);
242
243 r(s_commentdone, t_comment, s_start);
244
245 }
246
247 /* *INDENT-ON* */
248
/*
 * is_oneof -- test whether character `c` occurs in the NUL-terminated
 * string `chars`.
 *
 * Returns 1 on a match and 0 otherwise.  The terminating NUL is never
 * matched, so c == '\0' always yields 0.
 */
static int is_oneof (char c, char *chars)
{
  const char *p;

  for (p = chars; *p != '\0'; p++) {
    if (*p == c)
      return 1;
  }
  return 0;
}
258
/*
 * nextchar -- fetch the next input byte of tokenizer `t`.
 *
 * Refills t->inbuf from t->file_in with fread() whenever the buffered data
 * is used up, and bumps t->line_no for every '\n' handed out.
 *
 * Returns the byte as a value in 0..255, or -1 when fread() delivers no
 * more data (end of file and read errors are not distinguished).
 */
static int nextchar (xml_tok_state * t)
{
  int ret;

  if (t->inbufpos >= t->inbuflen) {
    t->inbuflen = fread (t->inbuf, 1, inbufsize, t->file_in);
    t->inbufpos = 0;
    if (!t->inbuflen)
      return -1;
  }

  /* Go through unsigned char: should inbuf be declared as plain char, a
     byte >= 0x80 would otherwise turn negative, and 0xFF would become
     indistinguishable from the -1 end-of-input marker. */
  ret = (unsigned char) t->inbuf[t->inbufpos++];

  if (ret == '\n')
    t->line_no++;

  return ret;
}
277
/*
 * Append character `c` to t->rbuf, keeping the buffer NUL-terminated.
 * Characters that no longer fit are dropped.  The capacity is taken from
 * sizeof, which relies on rbuf being an array member of xml_tok_state
 * (xml_tok_init() allocates nothing but the struct itself).
 */
static void xml_tok_store (xml_tok_state * t, size_t *len, int c)
{
  if (*len + 1 < sizeof t->rbuf) {
    t->rbuf[(*len)++] = c;
    t->rbuf[*len] = 0;
  }
}

/*
 * xml_tok_get -- pull the next token from the input of tokenizer `t`.
 *
 * Returns the token type.  For every token except t_eof, *data is set to
 * the token text: t->rbuf, or t->curtag (the name of the most recently
 * opened tag) for t_endtag and t_closeemptytag.  The text lives inside `t`
 * and is overwritten by the next call.  *data is left untouched on t_eof.
 *
 * t_eof is returned when the input is exhausted and also when the state
 * machine cannot continue (error state, or no transition for the current
 * character).
 * NOTE(review): if xml_tok.h provides a dedicated error token type,
 * returning it for the "cannot continue" case would be more informative.
 *
 * Text that does not fit into t->rbuf or t->curtag is truncated.
 */
int xml_tok_get (xml_tok_state * t, char **data)
{
  size_t rbuflen = 0;
  state_entry *s;
  int i;

  init_statetable ();
  t->rbuf[0] = 0;

  for (;;) {
    /* s_error (and anything outside the enum) is not guaranteed to have a
       row in state_table; never index the table with it. */
    if (t->state < 0 || t->state >= s_error)
      return t_eof;

    s = &state_table[t->state][0];

    /* A returning state hands out the collected buffer.  This is checked
       before a lookahead character is fetched, so that a token completed by
       the very last byte of the input is still delivered. */
    if (s->state && s->return_type != t_none) {
      *data = t->rbuf;
      t->state = s->next_state;
      if (s->return_type == t_tag)
        snprintf (t->curtag, sizeof t->curtag, "%s", t->rbuf);
      if (s->return_type == t_endtag || s->return_type == t_closeemptytag)
        *data = t->curtag;
      return s->return_type;
    }

    if (!t->c_held) {
      t->c = nextchar (t);
      if (t->c == -1)
        return t_eof;
      t->c_held = 1;
    }

    if (t->state == s_dtd) {    /* FIXME: should make better code for skipping DTD */
      /* Collect everything up to the '>' that balances the "<!" which
         opened the declaration.  Quotes and square brackets are not taken
         into account when counting angle brackets. */
      int depth = 1;

      xml_tok_store (t, &rbuflen, t->c);
      while (depth) {
        t->c = nextchar (t);
        if (t->c == -1) {
          /* do not keep the end marker around as a held character */
          t->c_held = 0;
          return t_eof;
        }
        if (t->c == '<')
          depth++;
        else if (t->c == '>')
          depth--;
        if (depth)              /* the final '>' is not part of the text */
          xml_tok_store (t, &rbuflen, t->c);
      }
      t->c_held = 0;
      t->state = s_start;
      *data = t->rbuf;
      return t_dtd;
    }

    /* Find the first transition of the current state matching the held
       character; the row ends at the first empty slot. */
    for (i = 0; i < max_entries && s[i].state; i++) {
      unsigned char uc = (unsigned char) t->c;

      if (s[i].chars && is_oneof (t->c, s[i].chars))
        break;
      if ((s[i].r_start + s[i].r_end)
          && uc >= s[i].r_start && uc <= s[i].r_end)
        break;
    }

    /* No transition applies: retrying with the same state and the same
       held character would never terminate, so give up instead. */
    if (i == max_entries || !s[i].state)
      return t_eof;

    if (s[i].charhandling & c_store)
      xml_tok_store (t, &rbuflen, t->c);
    if (s[i].charhandling & c_eat)
      t->c_held = 0;
    t->state = s[i].next_state;
  }
}
362
xml_tok_init(FILE * file_in)363 xml_tok_state *xml_tok_init (FILE * file_in)
364 {
365 xml_tok_state *ret;
366
367 ret = calloc (1, sizeof (xml_tok_state));
368 ret->file_in = file_in;
369 ret->state = s_start;
370 return ret;
371 }
372
/*
 * xml_tok_cleanup -- release a tokenizer state.
 *
 * Only the state itself is freed; the input stream stored in it is not
 * closed here.
 */
void xml_tok_cleanup (xml_tok_state * t)
{
  free (t);
}
377
/* NULL-terminated list of tag names that html_tok_get() reports as empty
   elements as soon as their opening tag ends. */
char *empty_tags[] = {
  "img",  "IMG",
  "br",   "BR",
  "hr",   "HR",
  "META", "meta",
  "link", "LINK",
  NULL
};
381
/* NULL-terminated list of tag names; not referenced by the functions in
   this file. */
char *endomission_tags[] = {
  "li", "LI",
  "p",  "P",
  "td", "TD",
  "tr", "TR",
  NULL
};
385
/*
 * string_is_oneof -- look up string `s` in the NULL-terminated array `ss`.
 *
 * The comparison is exact (strcmp).  Returns 1 if `s` equals one of the
 * entries, 0 otherwise.
 */
int string_is_oneof (char *s, char **ss)
{
  size_t i;

  for (i = 0; ss[i] != NULL; i++) {
    if (strcmp (s, ss[i]) == 0)
      return 1;
  }
  return 0;
}
395
396
/*
 * Compare a name kept on the open-tag stack with a freshly read tag name.
 * A stack slot of `width` bytes holds at most width - 1 characters, so only
 * that many characters are significant.
 */
static int html_same_tag (const char *stored, const char *name, size_t width)
{
  return strncmp (stored, name, width - 1) == 0;
}

/*
 * html_tok_get -- xml_tok_get() with some tolerance for HTML.
 *
 * Wraps xml_tok_get() and keeps a stack of the currently open tag names:
 *
 *  - an opening tag with the same name as the innermost open tag first
 *    yields a synthesized t_closetag; the opening tag itself is delivered by
 *    the following call;
 *  - the end of an opening tag listed in empty_tags is reported as
 *    t_closeemptytag;
 *  - a closing tag that does not match the innermost open tag yields a
 *    t_closetag for that open tag (the pair is also printed to stderr), and
 *    the closing tag that was actually read is delivered by the following
 *    call.
 *
 * Returns the token type and stores the token text in *data.  *data is left
 * untouched when the underlying tokenizer supplied no text.
 *
 * All bookkeeping lives in static variables, so the function is not
 * reentrant and supports only one token stream at a time.
 *
 * NOTE(review): the implicit close on a repeated tag name applies to every
 * tag, not only to those in endomission_tags (which nothing here uses);
 * properly nested elements of the same name are therefore closed early.
 * Confirm whether the check was meant to be limited to that list.
 */
int html_tok_get (xml_tok_state * s, char **data)
{
  static int got_a_stored_tag = 0;
  static char stored_tag[4096];
  static int stored_type = t_eof;
  static char opentags[64][64];
  static int level = 0;         /* nesting depth; may exceed the stack size */
  const int max_level = (int) (sizeof opentags / sizeof opentags[0]);
  char *rdata = NULL;
  int type;

  if (got_a_stored_tag) {
    got_a_stored_tag = 0;
    *data = stored_tag;
    return stored_type;
  }

  type = xml_tok_get (s, &rdata);

  switch (type) {
  case t_tag:
    /* Only look at the innermost open tag if there is one and its name was
       actually recorded. */
    if (level > 0 && level <= max_level
        && html_same_tag (opentags[level - 1], rdata, sizeof opentags[0])) {
      got_a_stored_tag = 1;
      snprintf (stored_tag, sizeof stored_tag, "%s", rdata);
      stored_type = type;
      *data = stored_tag;       /* the tag being closed has the same name */
      return t_closetag;
    }
    /* Beyond the capacity of the stack only the depth is tracked. */
    if (level < max_level)
      snprintf (opentags[level], sizeof opentags[level], "%s", rdata);
    level++;
    break;
  case t_endtag:
    *data = rdata;
    if (string_is_oneof (rdata, empty_tags)) {
      if (level > 0)
        level--;
      return t_closeemptytag;
    }
    break;
  case t_closeemptytag:
  case t_closetag:              /* FIXME: do more than one level */
    if (level <= 0)
      break;                    /* stray closing tag: nothing is open */
    level--;
    if (level < max_level
        && !html_same_tag (opentags[level], rdata, sizeof opentags[0])) {
      fprintf (stderr, "%s/%s\n", opentags[level], rdata);

      got_a_stored_tag = 1;
      stored_type = t_closetag;
      snprintf (stored_tag, sizeof stored_tag, "%s", rdata);
      /* this synthesized token closes the innermost open tag */
      *data = opentags[level];
      if (level > 0)
        level--;
      return t_closetag;
    }
    break;
  default:
    break;
  }

  if (rdata != NULL)
    *data = rdata;
  return type;
}
455