1 /*
2  * Copyright (c) 2003, 2004 X/IO Labs, xiolabs.com.
3  * Copyright (c) 2003, 2004, 2005 Lev Walkin <vlm@lionet.info>.
4  * 	All rights reserved.
5  * Redistribution and modifications are permitted subject to BSD license.
6  */
7 #include <asn_system.h>
8 #include <xer_support.h>
9 
/*
 * Parser states.
 * The numeric values are spelled out because the current state is handed
 * back to the caller as a plain int between pxml_parse() invocations.
 */
typedef enum {
	ST_TEXT			= 0,	/* Plain text, outside of any markup */
	ST_TAG_START		= 1,	/* Right after the '<' */
	ST_TAG_BODY		= 2,	/* Inside the tag */
	ST_TAG_QUOTE_WAIT	= 3,	/* After the '=' inside the tag */
	ST_TAG_QUOTED_STRING	= 4,	/* Inside a "quoted" attribute value */
	ST_TAG_UNQUOTED_STRING	= 5,	/* Inside an unquoted attribute value */
	ST_COMMENT_WAIT_DASH1	= 6,	/* "<!--"[1]: seen "<!" */
	ST_COMMENT_WAIT_DASH2	= 7,	/* "<!--"[2]: seen "<!-" */
	ST_COMMENT		= 8,	/* Inside the comment */
	ST_COMMENT_CLO_DASH2	= 9,	/* "-->"[0]: seen '-' in the comment */
	ST_COMMENT_CLO_RT	= 10	/* "-->"[1]: seen "--", waiting for '>' */
} pstate_e;
24 
25 static pxml_chunk_type_e final_chunk_type[] = {
26 	PXML_TEXT,
27 	PXML_TAG_END,
28 	PXML_COMMENT_END,
29 	PXML_TAG_END,
30 	PXML_COMMENT_END,
31 };
32 
33 
/*
 * Character classification table, indexed by the byte value:
 *	0 - none of the classes below;
 *	1 - whitespace (TAB, LF, FF, CR, SPACE);
 *	2 - decimal digit;
 *	3 - ASCII letter.
 * Only the first 128 entries are spelled out; the entries for the bytes
 * 0x80..0xFF are implicitly zero. The table is never modified, hence it
 * is const-qualified.
 */
static const int
_charclass[256] = {
	0,0,0,0,0,0,0,0, 0,1,1,0,1,1,0,0,
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	2,2,2,2,2,2,2,2, 2,2,0,0,0,0,0,0,	/* 01234567 89       */
	0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,	/*  ABCDEFG HIJKLMNO */
	3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0,	/* PQRSTUVW XYZ      */
	0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,	/*  abcdefg hijklmno */
	3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0	/* pqrstuvw xyz      */
};
/*
 * Classification predicates. The argument is converted to unsigned char
 * first, so a negative plain char never indexes outside of the table.
 */
#define WHITESPACE(c)	(_charclass[(unsigned char)(c)] == 1)
#define ALNUM(c)	(_charclass[(unsigned char)(c)] >= 2)
#define ALPHA(c)	(_charclass[(unsigned char)(c)] == 3)
48 
/*
 * Aliases for characters, ASCII/UTF-8.
 * Given as numeric codes rather than as character literals, so the values
 * stay the same whatever the compiler's execution character set is.
 */
enum {
	EXCLAM	= 0x21,	/* '!' */
	CQUOTE	= 0x22,	/* '"' */
	CDASH	= 0x2d,	/* '-' */
	CSLASH	= 0x2f,	/* '/' */
	LANGLE	= 0x3c,	/* '<' */
	CEQUAL	= 0x3d,	/* '=' */
	RANGLE	= 0x3e,	/* '>' */
	CQUEST	= 0x3f	/* '?' */
};
58 
/*
 * Invoke token callback.
 *
 * These macros are usable only inside pxml_parse(): the expansion reads
 * and writes its locals (p, chunk_start, state, cb, key) and jumps to its
 * "finish" label.
 *
 * TOKEN_CB_CALL(type, _ns, _current_too, _final):
 *	type		- chunk type handed over to the callback;
 *	_ns		- parser state to switch to once the chunk is delivered;
 *	_current_too	- 1 to make the byte at p part of the chunk,
 *			  0 to end the chunk right before it;
 *	_final		- not referenced by the expansion.
 *
 * The chunk is [chunk_start, p + _current_too). An empty chunk is not
 * reported, only the state gets switched (the "break" leaves just the
 * do/while wrapper). Otherwise the callback is invoked. If it returns
 * at least the chunk size, chunk_start is moved past the chunk and the
 * state is switched. If it returns less, the parsing is stopped through
 * "goto finish" with chunk_start left in place; in that case the state is
 * switched only if _current_too is set and the return value is exactly -1.
 */
#define	TOKEN_CB_CALL(type, _ns, _current_too, _final) do {	\
		int _ret;					\
		pstate_e ns  = _ns;				\
		ssize_t _sz = (p - chunk_start) + _current_too;	\
		if (!_sz) {					\
			/* Shortcut */				\
			state = _ns;				\
			break;					\
		}						\
		_ret = cb(type, chunk_start, _sz, key);		\
		if(_ret < _sz) {				\
			if(_current_too && _ret == -1)		\
				state = ns;			\
			goto finish;				\
		}						\
		chunk_start = p + _current_too;			\
		state = ns;					\
	} while(0)

/* Report a chunk under the given type as is. */
#define TOKEN_CB(_type, _ns, _current_too)			\
	TOKEN_CB_CALL(_type, _ns, _current_too, 0)

/*
 * Report a chunk which completes its element: the given type is first
 * translated through the final_chunk_type[] table.
 */
#define TOKEN_CB_FINAL(_type, _ns, _current_too)		\
	TOKEN_CB_CALL(final_chunk_type[_type], _ns, _current_too, 1)
84 
/*
 * Parser itself.
 *
 * Scans the given buffer byte by byte, splits it into text, tag and
 * comment chunks and reports every chunk through the callback.
 *
 *	stateContext	- parser state carried over between the calls; it is
 *			  read on entry and overwritten on exit. The value
 *			  of zero corresponds to ST_TEXT.
 *	xmlbuf, size	- the bytes to scan.
 *	cb		- invoked as cb(type, chunk_start, chunk_size, key);
 *			  a return value smaller than the chunk size stops
 *			  the scan (see TOKEN_CB_CALL).
 *	key		- handed over to the callback unchanged.
 *
 * Returns the offset, relative to xmlbuf, of the first byte which has not
 * been reported to the callback and accepted by it.
 */
ssize_t pxml_parse(int *stateContext, const void *xmlbuf, size_t size, pxml_callback_f *cb, void *key) {
	pstate_e state = (pstate_e)*stateContext;
	const char *chunk_start = (const char *)xmlbuf;
	const char *p = chunk_start;
	const char *end = p + size;

	for(; p < end; p++) {
	  /* Fetched as unsigned char, so the bytes >= 0x80 stay positive */
	  int C = *(const unsigned char *)p;
	  switch(state) {
	  case ST_TEXT:
		/*
		 * Initial state: we're in the middle of some text,
		 * or just have started.
		 */
		if (C == LANGLE)
			/* We're now in the tag, probably */
			TOKEN_CB(PXML_TEXT, ST_TAG_START, 0);
		break;
	  case ST_TAG_START:
		if (ALPHA(C) || (C == CSLASH))
			state = ST_TAG_BODY;
		else if (C == EXCLAM)
			state = ST_COMMENT_WAIT_DASH1;
		else
			/*
			 * Neither a letter, nor '/', nor '!'.
			 * Must be something like "3 < 4".
			 */
			TOKEN_CB(PXML_TEXT, ST_TEXT, 1);/* Flush as data */
		break;
	  case ST_TAG_BODY:
		switch(C) {
		case RANGLE:
			/* End of the tag */
			TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
			break;
		case LANGLE:
			/*
			 * The previous tag wasn't completed, but still
			 * recognized as valid. (Mozilla-compatible)
			 */
			TOKEN_CB_FINAL(PXML_TAG, ST_TAG_START, 0);
			break;
		case CEQUAL:
			state = ST_TAG_QUOTE_WAIT;
			break;
		}
		break;
	  case ST_TAG_QUOTE_WAIT:
		/*
		 * State after the equal sign ("=") in the tag.
		 */
		switch(C) {
		case CQUOTE:
			state = ST_TAG_QUOTED_STRING;
			break;
		case RANGLE:
			/* End of the tag */
			TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
			break;
		default:
			if(!WHITESPACE(C))
				/* Unquoted string value */
				state = ST_TAG_UNQUOTED_STRING;
		}
		break;
	  case ST_TAG_QUOTED_STRING:
		/*
		 * Tag attribute's string value in quotes.
		 */
		if(C == CQUOTE) {
			/* Return back to the tag state */
			state = ST_TAG_BODY;
		}
		break;
	  case ST_TAG_UNQUOTED_STRING:
		if(C == RANGLE) {
			/* End of the tag */
			TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
		} else if(WHITESPACE(C)) {
			/* Return back to the tag state */
			state = ST_TAG_BODY;
		}
		break;
	  case ST_COMMENT_WAIT_DASH1:
		/*
		 * NOTE(review): the byte following "<!" (and "<!-" in the
		 * next state) is consumed without being examined as a tag
		 * body byte, so a '>' or a '<' in that position does not
		 * terminate the tag -- confirm this is intended.
		 */
		if(C == CDASH) {
			state = ST_COMMENT_WAIT_DASH2;
		} else {
			/* Some ordinary tag. */
			state = ST_TAG_BODY;
		}
		break;
	  case ST_COMMENT_WAIT_DASH2:
		if(C == CDASH) {
			/* Seen "<!--" */
			state = ST_COMMENT;
		} else {
			/* Some ordinary tag */
			state = ST_TAG_BODY;
		}
		break;
	  case ST_COMMENT:
		if(C == CDASH) {
			state = ST_COMMENT_CLO_DASH2;
		}
		break;
	  case ST_COMMENT_CLO_DASH2:
		if(C == CDASH) {
			state = ST_COMMENT_CLO_RT;
		} else {
			/* This is not an end of a comment */
			state = ST_COMMENT;
		}
		break;
	  case ST_COMMENT_CLO_RT:
		if(C == RANGLE) {
			TOKEN_CB_FINAL(PXML_COMMENT, ST_TEXT, 1);
		} else if(C == CDASH) {
			/* Maintain current state, still waiting for '>' */
		} else {
			state = ST_COMMENT;
		}
		break;
	  } /* switch(state) */
	} /* for() */

	/*
	 * Flush the partially processed chunk, state permitting.
	 * Only the plain text and the comment body are reported early, as
	 * non-final chunks. In any other state the pending bytes are not
	 * reported and are left out of the return value.
	 */
	if(p - chunk_start) {
		switch (state) {
		case ST_COMMENT:
			TOKEN_CB(PXML_COMMENT, state, 0);
			break;
		case ST_TEXT:
			TOKEN_CB(PXML_TEXT, state, 0);
			break;
		default: break;	/* a no-op */
		}
	}

finish:
	/* Save the state for the next invocation */
	*stateContext = (int)state;
	return chunk_start - (const char *)xmlbuf;
}
233 
234