1 /*
2  * xml_tok.c -- pull based xml tokenizer
3  *
 * Copyright (C) 2002 Øyvind Kolås <pippin@users.sourceforge.net>
5  *
6  * This program is free software; you can redistribute it and/or modify it under
7  * the terms of the GNU General Public License as published by the Free
8  * Software Foundation; either version 2, or (at your option) any later
9  * version.
10  *
11  * This program is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14  * more details.
15  *
16  * You should have received a copy of the GNU General Public License along with
17  * this program; if not, write to the Free Software Foundation, Inc., 59
18  * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19  */
20 
21 #include <string.h>
22 #include <stdlib.h>
23 #include <stdio.h>
24 #include <ctype.h>
25 #include "xml_tok.h"
26 
/*
 * States of the tokenizer's state machine.
 *
 * The numeric values are used as row indices into state_table, so the
 * order below must not change: s_null has to stay 0 because a zeroed
 * table entry is treated as "unused", and s_error has to stay last
 * because it is used to size the table.
 */
enum {
	s_null = 0,				/* marks an unused table entry */
	s_start,				/* initial state, between tokens */
	s_tag,					/* just after '<' */
	s_tagnamestart,
	s_tagname,
	s_tagnamedone,			/* returns t_tag */
	s_intag,				/* inside a start tag, after its name */
	s_attstart,
	s_attname,
	s_attdone,				/* returns t_att */
	s_att,					/* after an attribute name */
	s_atteq,				/* after '=' */
	s_eqquot,				/* value opened with '"' */
	s_eqvalstart,			/* not referenced by the rules in this file */
	s_eqapos,				/* value opened with '\'' */
	s_eqaposval,
	s_eqaposvaldone,		/* returns t_val */
	s_eqval,				/* unquoted value */
	s_eqvaldone,			/* returns t_val */
	s_eqquotval,
	s_eqquotvaldone,		/* returns t_val */
	s_tagend,				/* returns t_endtag */
	s_empty,				/* after '/' inside a start tag */
	s_emptyend,				/* returns t_closeemptytag */
	s_whitespace,
	s_whitespacedone,		/* returns t_whitespace */
	s_entitystart,			/* just after '&' */
	s_entity,
	s_entitydone,			/* returns t_entity */
	s_word,
	s_worddone,				/* returns t_word */
	s_tagclose,				/* just after "</" */
	s_tagclosenamestart,
	s_tagclosename,
	s_tagclosedone,			/* returns t_closetag */
	s_tagexcl,				/* just after "<!" */
	s_commentdash1,
	s_commentdash2,
	s_incomment,
	s_commentenddash1,
	s_commentenddash2,
	s_commentdone,			/* returns t_comment */
	s_dtd,					/* handled by dedicated code in xml_tok_get() */
	s_prolog,				/* just after "<?" */
	s_prologq,
	s_prologdone,			/* returns t_prolog */
	s_eof,					/* not referenced by the rules in this file */
	s_error					/* must remain the last enumerator */
};
77 
78 
/* The characters the transition rules treat as whitespace. */
char *c_ws = " \n\r\t";
80 
/*
 * Flags stored in state_entry.charhandling.  The transition table combines
 * them by addition (c_eat + c_store); xml_tok_get() tests them with '&'.
 */
enum {
	c_nil = 0,		/* leave the current char held for the next state */
	c_eat = 1,		/* consume the char: the next state gets a fresh one */
	c_store = 2		/* append the current char to the output buffer */
};
86 
/*
 * One entry of the transition table.
 *
 * An entry is either a transition rule (filled in by a()) or a "return"
 * entry (filled in by r(), return_type set).  An entry whose state field
 * is zero is unused and terminates the scan of its row.
 */
typedef struct {
	int state;					/* owning state; 0 (s_null) means unused */
	char *chars;				/* rule matches if the char is in this set (may be NULL) */
	unsigned char r_start;		/* rule also matches chars in r_start..r_end, */
	unsigned char r_end;		/*   inclusive; ignored when both are 0 */
	int next_state;				/* state to switch to when the entry fires */
	int resetbuf;				/* not read or written by the code in this file */
	int charhandling;			/* combination of c_eat / c_store */
	int return_type;			/* when set, return the current buffer as this token type */
} state_entry;
97 
98 #define max_entries 20
99 
100 static state_entry state_table[s_error][max_entries];
101 
a(int state,char * chars,unsigned char r_start,unsigned char r_end,int charhandling,int next_state)102 static void a (int state, char *chars, unsigned char r_start,
103 			   unsigned char r_end, int charhandling, int next_state)
104 {
105 	int no = 0;
106 
107 	while (state_table[state][no].state != s_null)
108 		no++;
109 	state_table[state][no].state = state;
110 	state_table[state][no].r_start = r_start;
111 	if (chars)
112 		state_table[state][no].chars = strdup (chars);
113 	state_table[state][no].r_end = r_end;
114 	state_table[state][no].charhandling = charhandling;
115 	state_table[state][no].next_state = next_state;
116 }
117 
r(int state,int return_type,int next_state)118 static void r (int state, int return_type, int next_state)
119 {
120 	state_table[state][0].state = state;
121 	state_table[state][0].return_type = return_type;
122 	state_table[state][0].next_state = next_state;
123 }
124 
125 /* *INDENT-OFF* */
126 
init_statetable(void)127 void init_statetable(void){
128 	static int inited=0;
129 	if(inited)return;
130 	inited=1;
131 	memset(state_table,0,sizeof(state_table));
132 	a(s_start,		"<",	0,0,			c_eat,			s_tag);
133 	a(s_start,		c_ws,	0,0,			c_eat+c_store,	s_whitespace);
134 	a(s_start,		"&",	0,0,			c_eat,			s_entitystart);
135 	a(s_start,		NULL,	0,255,			c_eat+c_store,	s_word);
136 
137 	a(s_tag,		c_ws,	0,0,			c_eat,			s_tag);
138 	a(s_tag,		"/",	0,0,			c_eat,			s_tagclose);
139 	a(s_tag,		"!",	0,0,			c_eat,			s_tagexcl);
140 	a(s_tag,		"?",	0,0,			c_eat,			s_prolog);
141 	a(s_tag,		NULL,	0,255,			c_eat+c_store,	s_tagnamestart);
142 
143 	a(s_tagclose,	NULL,	0,255,			c_eat+c_store,	s_tagclosenamestart);
144 	a(s_tagclosenamestart,	">",	0,0,	c_eat,			s_tagclosedone);
145 	a(s_tagclosenamestart,	NULL,	0,255,	c_eat+c_store,	s_tagclosename);
146 	a(s_tagclosename,	">",	0,0,		c_eat,			s_tagclosedone);
147 	a(s_tagclosename,	NULL,	0,255,		c_eat+c_store,	s_tagclosename);
148 	r(s_tagclosedone,	t_closetag,							s_start);
149 
150 	a(s_whitespace,		c_ws,	0,0,		c_eat+c_store,	s_whitespace);
151 	a(s_whitespace,		NULL,	0,255,		c_nil,			s_whitespacedone);
152 	r(s_whitespacedone,	t_whitespace,						s_start);
153 
154 	a(s_entitystart,";",	0,0,			c_eat,			s_entitydone);
155 	a(s_entitystart,NULL,	0,255,			c_eat+c_store,	s_entity);
156 	a(s_entity,		";",	0,0,			c_eat,			s_entitydone);
157 	a(s_entity,NULL,		0,255,			c_eat+c_store,	s_entity);
158 	r(s_entitydone,	t_entity,								s_start);
159 
160 	a(s_word,		c_ws,	0,0,			c_nil,			s_worddone);
161 	a(s_word,		"<&",	0,0,			c_nil,			s_worddone);
162 	a(s_word,		NULL,	0,255,			c_eat+c_store,	s_word);
163 	r(s_worddone,	t_word,									s_start);
164 
165 	a(s_tagnamestart,c_ws,	0,0,			c_nil,			s_tagnamedone);
166 	a(s_tagnamestart,	"/>",	0,0,		c_nil,			s_tagnamedone);
167 	a(s_tagnamestart,NULL,	0,255,			c_eat+c_store,	s_tagname);
168 	a(s_tagname,	c_ws,	0,0,			c_nil,			s_tagnamedone);
169 	a(s_tagname,	"/>",	0,0,			c_nil,			s_tagnamedone);
170 	a(s_tagname,	NULL,	0,255,			c_eat+c_store,	s_tagname);
171 	r(s_tagnamedone,	t_tag,								s_intag);
172 
173 	a(s_intag,		c_ws,	0,0,			c_eat,			s_intag);
174 	a(s_intag,		">",	0,0,			c_eat,			s_tagend);
175 	a(s_intag,		"/",	0,0,			c_eat,			s_empty);
176 	a(s_intag,		NULL,	0,255,			c_eat+c_store,	s_attstart);
177 
178 	a(s_attstart,	c_ws,	0,0,			c_eat,			s_attdone);
179 	a(s_attstart,	"=/>",	0,0,			c_nil,			s_attdone);
180 	a(s_attstart,	NULL,	0,255,			c_eat+c_store,	s_attname);
181 	a(s_attname,	"=/>",	0,0,			c_nil,			s_attdone);
182 	a(s_attname,	c_ws,	0,0,			c_eat,			s_attdone);
183 	a(s_attname,	NULL,	0,255,			c_eat+c_store,	s_attname);
184 	r(s_attdone,	t_att,									s_att);
185 	a(s_att,		c_ws,	0,0,			c_eat,			s_att);
186 	a(s_att,		"=",	0,0,			c_eat,			s_atteq);
187 	a(s_att,		NULL,	0,255,			c_eat,			s_intag);
188 	a(s_atteq,		"'",	0,0,			c_eat,			s_eqapos);
189 	a(s_atteq,		"\"",	0,0,			c_eat,			s_eqquot);
190 	a(s_atteq,		c_ws,	0,0,			c_eat,			s_atteq);
191 	a(s_atteq,		NULL,	0,255,			c_nil,			s_eqval);
192 
193 	a(s_eqapos,		"'",	0,0,			c_eat,			s_eqaposvaldone);
194 	a(s_eqapos,		NULL,	0,255,			c_eat+c_store,	s_eqaposval);
195 	a(s_eqaposval,		"'",	0,0,		c_eat,			s_eqaposvaldone);
196 	a(s_eqaposval,		NULL,	0,255,		c_eat+c_store,	s_eqaposval);
197 	r(s_eqaposvaldone,	t_val,									s_intag);
198 
199 	a(s_eqquot,		"\"",	0,0,			c_eat,			s_eqquotvaldone);
200 	a(s_eqquot,		NULL,	0,255,			c_eat+c_store,	s_eqquotval);
201 	a(s_eqquotval,		"\"",	0,0,		c_eat,			s_eqquotvaldone);
202 	a(s_eqquotval,		NULL,	0,255,		c_eat+c_store,	s_eqquotval);
203 	r(s_eqquotvaldone,	t_val,									s_intag);
204 
205 	a(s_eqval,		c_ws,	0,0,			c_nil,			s_eqvaldone);
206 	a(s_eqval,		"/>",	0,0,			c_nil,			s_eqvaldone);
207 	a(s_eqval,		NULL,	0,255,			c_eat+c_store,	s_eqval);
208 
209 	r(s_eqvaldone,	t_val,									s_intag);
210 
211 	r(s_tagend,		t_endtag,				s_start);
212 
213 	a(s_empty,		">",0,0,				c_eat,			s_emptyend);
214 	a(s_empty,		NULL,0,255,				c_eat,			s_empty);
215 	r(s_emptyend,	t_closeemptytag,						s_start);
216 
217 	a(s_prolog,		"?",0,0,				c_eat,			s_prologq);
218 	a(s_prolog,		NULL,0,255,				c_eat+c_store,	s_prolog);
219 
220 	a(s_prologq,	">",0,0,				c_eat,			s_prologdone);
221 	a(s_prologq,	NULL,0,255,				c_eat+c_store,	s_prolog);
222 	r(s_prologdone,	t_prolog,				s_start);
223 
224 	a(s_tagexcl,	"-",0,0,				c_eat,			s_commentdash1);
225 	a(s_tagexcl,	"D",0,0,				c_nil,			s_dtd);
226 	a(s_tagexcl,	NULL,0,255,				c_eat,			s_start);
227 
228 	a(s_commentdash1,	"-",0,0,				c_eat,			s_commentdash2);
229 	a(s_commentdash1,	NULL,0,255,				c_eat,			s_error);
230 
231 	a(s_commentdash2,	"-",0,0,				c_eat,			s_commentenddash1);
232 	a(s_commentdash2,	NULL,0,255,				c_eat+c_store,	s_incomment);
233 
234 	a(s_incomment   ,	"-",0,0,				c_eat,			s_commentenddash1);
235 	a(s_incomment   ,	NULL,0,255,				c_eat+c_store,	s_incomment);
236 
237 	a(s_commentenddash1,	"-",0,0,			c_eat,			s_commentenddash2);
238 	a(s_commentenddash1,	NULL,0,255,			c_eat+c_store,	s_incomment);
239 
240 	a(s_commentenddash2,	">",0,0,			c_eat,			s_commentdone);
241 	a(s_commentenddash2,	NULL,0,255,			c_eat+c_store,	s_incomment);
242 
243 	r(s_commentdone,	t_comment,				s_start);
244 
245 }
246 
247 /* *INDENT-ON* */
248 
/*
 * Return 1 if `c` occurs in the NUL-terminated string `chars`, 0 otherwise.
 * The terminating NUL itself never counts as a match.
 */
static int is_oneof (char c, char *chars)
{
	char *p;

	for (p = chars; *p != '\0'; p++) {
		if (*p == c)
			return 1;
	}
	return 0;
}
258 
/*
 * Return the next input byte as a value in 0..255, or -1 when no more
 * input can be read.
 *
 * Input is read from t->file_in in chunks of inbufsize bytes into
 * t->inbuf; t->line_no is incremented for every '\n' handed out.
 */
static int nextchar (xml_tok_state * t)
{
	int ret;

	if (t->inbufpos >= t->inbuflen) {
		t->inbuflen = fread (t->inbuf, 1, inbufsize, t->file_in);
		t->inbufpos = 0;
		if (!t->inbuflen)
			return -1;
	}

	/*
	 * Go through unsigned char so a byte can never come out negative.
	 * If inbuf has plain (signed) char elements, a direct conversion to int
	 * turns 0xFF into -1, which is indistinguishable from the end-of-input
	 * value, and turns every byte >= 0x80 into a negative number that none
	 * of the 0..255 range rules match.  The cast is a no-op if the buffer
	 * is already unsigned.
	 */
	ret = (unsigned char) t->inbuf[t->inbufpos++];

	if (ret == '\n')
		t->line_no++;

	return ret;
}
277 
/*
 * Append `c` to the token buffer t->rbuf, keeping it NUL-terminated.
 * Characters that no longer fit are dropped, so an over-long token is
 * truncated instead of overflowing the buffer.
 *
 * Relies on rbuf being an array member of xml_tok_state (xml_tok_init()
 * allocates nothing but the struct itself), so that sizeof gives its
 * capacity.
 */
static void rbuf_store (xml_tok_state * t, int *len, int c)
{
	if (*len + 1 < (int) sizeof (t->rbuf)) {
		t->rbuf[(*len)++] = (char) c;
		t->rbuf[*len] = 0;
	}
}

/*
 * Read the next token from the input of `t`.
 *
 * Returns the token type (t_tag, t_att, t_val, t_word, ... or t_eof).
 * *data is set to the token's text: the tokenizer's internal buffer, or the
 * name of the current tag for t_endtag and t_closeemptytag.  The text is
 * owned by `t` and is overwritten by the next call.
 *
 * t_eof is returned at end of input, and also when the state machine
 * cannot continue (no table row for the current state, or no rule matching
 * the current character); the latter used to read outside the table or
 * loop forever.
 * NOTE(review): if xml_tok.h offers a dedicated error token type, returning
 * it for the second case would be more precise -- not visible from here.
 */
int xml_tok_get (xml_tok_state * t, char **data)
{
	int rbuflen = 0;
	int matched;
	state_entry *s;

	init_statetable ();
	t->rbuf[0] = 0;
	/* every return path, including t_dtd and t_eof, hands back a valid pointer */
	*data = t->rbuf;

	for (;;) {
		if (!t->c_held) {
			t->c = nextchar (t);
			if (t->c == -1)
				return t_eof;
			t->c_held = 1;
		}

		if (t->state == s_dtd) {	/* FIXME: should make better code for skipping DTD */
			/*
			 * Collect everything up to the '>' that balances the opening
			 * "<!".  Quoted strings are not recognised, so a '<' or '>'
			 * inside quotes is counted too.
			 */
			int abracket = 1;

			rbuf_store (t, &rbuflen, t->c);
			while (abracket) {
				t->c = nextchar (t);
				if (t->c == -1)
					return t_eof;
				if (t->c == '<')
					abracket++;
				else if (t->c == '>')
					abracket--;
				/* the closing '>' is not part of the token text */
				if (abracket)
					rbuf_store (t, &rbuflen, t->c);
			}
			t->c_held = 0;
			t->state = s_start;
			return t_dtd;
		}

		/* states without a usable table row (s_null, s_error, out of range) */
		if (t->state <= s_null || t->state >= s_error)
			return t_eof;

		matched = 0;
		for (s = &state_table[t->state][0]; s->state; s++) {
			if (s->return_type != t_none) {
				/* return state: hand the collected text to the caller */
				t->state = s->next_state;
				if (s->return_type == t_tag)
					snprintf (t->curtag, sizeof (t->curtag), "%s", t->rbuf);
				if (s->return_type == t_endtag
					|| s->return_type == t_closeemptytag)
					*data = t->curtag;
				return s->return_type;
			}
			if ((s->chars && is_oneof (t->c, s->chars)) ||
				((s->r_start + s->r_end)
				 && (t->c >= s->r_start && t->c <= s->r_end))) {
				if (s->charhandling & c_store)
					rbuf_store (t, &rbuflen, t->c);
				if (s->charhandling & c_eat)
					t->c_held = 0;
				t->state = s->next_state;
				matched = 1;
				break;
			}
		}

		/* nothing matched: state and held char would never change again */
		if (!matched)
			return t_eof;
	}
}
362 
xml_tok_init(FILE * file_in)363 xml_tok_state *xml_tok_init (FILE * file_in)
364 {
365 	xml_tok_state *ret;
366 
367 	ret = calloc (1, sizeof (xml_tok_state));
368 	ret->file_in = file_in;
369 	ret->state = s_start;
370 	return ret;
371 }
372 
/*
 * Release a tokenizer state obtained from xml_tok_init().
 * Only the state itself is freed; the input stream is not closed here.
 */
void xml_tok_cleanup (xml_tok_state * t)
{
	free (t);
}
377 
/*
 * NULL-terminated list of tag names that html_tok_get() reports as empty
 * elements as soon as their start tag ends.
 */
char *empty_tags[] = {
	"img", "IMG",
	"br", "BR",
	"hr", "HR",
	"META", "meta",
	"link", "LINK",
	NULL
};

/* NULL-terminated list of tag names whose end tag may be left out. */
char *endomission_tags[] = {
	"li", "LI",
	"p", "P",
	"td", "TD",
	"tr", "TR",
	NULL
};
385 
/*
 * Return 1 if the string `s` is equal (strcmp) to one of the strings in
 * the NULL-terminated array `ss`, 0 otherwise.
 */
int string_is_oneof (char *s, char **ss)
{
	char **candidate;

	for (candidate = ss; *candidate != NULL; candidate++) {
		if (strcmp (s, *candidate) == 0)
			return 1;
	}
	return 0;
}
395 
396 
/*
 * HTML flavoured wrapper around xml_tok_get(): returns the same token
 * stream, but patches it up so that elements are properly closed.
 *
 *  - A start tag from endomission_tags that repeats the innermost open
 *    element (<li>a<li>b) first yields a t_closetag for the open one.
 *  - The end of the start tag of an element from empty_tags (<br>) is
 *    reported as t_closeemptytag.
 *  - A close tag that does not match the innermost open element first
 *    yields a t_closetag for that element, then the close tag itself.
 *
 * Returns the token type and stores the token text in *data.  For the
 * synthesized t_closetag tokens *data is the name of the element being
 * closed.
 *
 * The bookkeeping (open element stack, pending token) lives in function
 * statics, so it is shared by all xml_tok_state objects: only one document
 * can be processed at a time.
 *
 * Elements nested deeper than max_depth are counted but not recorded, and
 * names are recorded up to max_taglen - 1 characters; comparisons are
 * limited accordingly.
 */
int html_tok_get (xml_tok_state * s, char **data)
{
	enum { max_depth = 64, max_taglen = 64 };
	static int got_a_stored_tag = 0;
	static char stored_tag[4096];
	static int stored_type = t_eof;
	static char opentags[max_depth][max_taglen];
	static int level = 0;
	char *rdata = NULL;
	int type;

	/* deliver the token held back by the previous call */
	if (got_a_stored_tag) {
		got_a_stored_tag = 0;
		*data = stored_tag;
		return stored_type;
	}

	type = xml_tok_get (s, &rdata);

	switch (type) {
		case t_tag:
			/*
			 * Implied end tag.  Checked only when something is open
			 * (level > 0; the old test read opentags[-1] at level 0) and
			 * only for elements whose end tag may be omitted, so that
			 * <div><div> nests instead of being flattened.
			 */
			if (level > 0 && level <= max_depth
				&& string_is_oneof (rdata, endomission_tags)
				&& !strncmp (opentags[level - 1], rdata, max_taglen - 1)) {
				got_a_stored_tag = 1;
				snprintf (stored_tag, sizeof (stored_tag), "%s", rdata);
				stored_type = type;
				/* one element closed, the same one reopened: level unchanged */
				*data = stored_tag;
				return t_closetag;
			}
			if (level < max_depth)
				snprintf (opentags[level], max_taglen, "%s", rdata);
			level++;
			break;
		case t_endtag:
			if (string_is_oneof (rdata, empty_tags)) {
				if (level > 0)
					level--;
				*data = rdata;
				return t_closeemptytag;
			}
			break;
		case t_closeemptytag:
		case t_closetag:		/* FIXME: do more than one level */
			/* stray close tag with nothing open: pass it through */
			if (level <= 0)
				break;
			level--;
			if (level < max_depth
				&& strncmp (opentags[level], rdata, max_taglen - 1)) {
				fprintf (stderr, "%s/%s\n", opentags[level], rdata);

				/* close the innermost element now, the requested one next */
				got_a_stored_tag = 1;
				stored_type = t_closetag;
				snprintf (stored_tag, sizeof (stored_tag), "%s", rdata);
				*data = opentags[level];
				if (level > 0)
					level--;
				return t_closetag;
			}
			break;
		default:
			break;
	}

	*data = rdata;
	return type;
}
455