1 /**
2  * @file
3  * The following is a full featured parser for configuration files using
4  * basic format "key = value".
5  *
6  * Well, it's big, but it can properly manage spaces, empty lines,
7  * single and double-quoted strings, hex numbers, comments, semicolons
8  * and more. It also happens to be much more robust than the original one.
9  *
10  * @author zamaz
11  */
12 
13 #include <stddef.h>
14 #include <assert.h>
15 #include "ckvp.h"
16 
/**
 * Parser states and transition actions.
 *
 * Every entry of the transition table used by ckvp_parse() is a state
 * number, optionally OR'ed with ACTION_* flags. State numbers fit in the
 * low byte; 0 is not a state (ckvp_parse() treats a zero table entry as
 * "character not listed").
 */
enum {
	/* Generic states. */
	STATE_ERROR       = 1,  /**< an error was met */
	STATE_BEGIN       = 2,  /**< initial state */
	STATE_COMMENT     = 3,  /**< inside a comment */

	/* Key states. */
	STATE_KEY         = 4,  /**< (key) inside a key */
	STATE_KEYBS       = 5,  /**< (key) just after a backslash */
	STATE_KEYBSX1     = 6,  /**< (key) expecting first hex digit of \\x */
	STATE_KEYBSX2     = 7,  /**< (key) expecting second hex digit of \\x */
	STATE_KEYSQ       = 8,  /**< (key) inside a single-quoted key */
	STATE_KEYDQ       = 9,  /**< (key) inside a double-quoted key */
	STATE_KEYDQBS     = 10, /**< (key) backslash inside double quotes */
	STATE_KEYDQBSX1   = 11, /**< (key) first hex digit of \\x in double quotes */
	STATE_KEYDQBSX2   = 12, /**< (key) second hex digit of \\x in double quotes */

	/* Around the '=' separator. */
	STATE_BEQ         = 13, /**< before '=' between key and value */
	STATE_AEQ         = 14, /**< after '=' between key and value */

	/* Value states, mirroring the key states. */
	STATE_VALUE       = 15, /**< (value) inside a value */
	STATE_VALBS       = 16, /**< (value) just after a backslash */
	STATE_VALBSX1     = 17, /**< (value) expecting first hex digit of \\x */
	STATE_VALBSX2     = 18, /**< (value) expecting second hex digit of \\x */
	STATE_VALSQ       = 19, /**< (value) inside a single-quoted value */
	STATE_VALDQ       = 20, /**< (value) inside a double-quoted value */
	STATE_VALDQBS     = 21, /**< (value) backslash inside double quotes */
	STATE_VALDQBSX1   = 22, /**< (value) first hex digit of \\x in double quotes */
	STATE_VALDQBSX2   = 23, /**< (value) second hex digit of \\x in double quotes */
	STATE_VALEND      = 24, /**< end of a value, ready to take a new key */

	/* Event flags (bits 8-10). */
	ACTION_KEY        = (1 << 8),  /**< key complete */
	ACTION_VALUE      = (1 << 9),  /**< value complete */
	ACTION_ERROR      = (1 << 10), /**< caught an error */

	/* Storage flags (bits 12-15). */
	ACTION_STORE      = (1 << 12), /**< character must be stored as is */
	ACTION_STORE_MOD  = (1 << 13), /**< store filtered character */
	ACTION_STORE_HEX1 = (1 << 14), /**< store first hex digit */
	ACTION_STORE_HEX2 = (1 << 15)  /**< store second hex digit */
};
50 
/**
 * Expand to a list of designated initializers that assign the same value
 * to the array slots indexed by each hexadecimal digit character
 * ('0'-'9', 'a'-'f' and 'A'-'F').
 *
 * Meant to be used inside the initializer of an array indexed by
 * character code. The argument is expanded once per digit (22 times).
 *
 * @param entry Value stored in every hexadecimal digit slot.
 */
#define	HEX_INDICES(entry)					\
	['0'] = (entry), ['1'] = (entry), ['2'] = (entry),	\
	['3'] = (entry), ['4'] = (entry), ['5'] = (entry),	\
	['6'] = (entry), ['7'] = (entry), ['8'] = (entry),	\
	['9'] = (entry),					\
	['a'] = (entry), ['A'] = (entry),			\
	['b'] = (entry), ['B'] = (entry),			\
	['c'] = (entry), ['C'] = (entry),			\
	['d'] = (entry), ['D'] = (entry),			\
	['e'] = (entry), ['E'] = (entry),			\
	['f'] = (entry), ['F'] = (entry)
58 
59 /**
60  * ckvp_parse() takes the current state (ckvp), a buffer in[size] and returns
61  * the number of characters processed.
62  *
63  * Each time ckvp_parse() returns, ckvp->state must be checked. If no error
64  * occured, ckvp_parse() must be called again with the remaining characters
65  * if any, otherwise the next input buffer.
66  *
67  * At the end of input, ckvp_parse() must be called with a zero size.
68  *
69  * This function doesn't allocate anything.
70  *
71  * @param[in,out] ckvp Current state.
72  * @param size Number of characters in buffer "in".
73  * @param in Input buffer to parse.
74  * @return Number of characters processed.
75  */
ckvp_parse(ckvp_t * ckvp,size_t size,const char in[])76 size_t ckvp_parse(ckvp_t *ckvp, size_t size, const char in[])
77 {
78 	/**
79 	 * State machine definition:
80 	 *
81 	 * st[current_state][current_character] = next state | action
82 	 *
83 	 * Special indices for current_character are:
84 	 *
85 	 * - 0x100 for action on characters not in the list
86 	 * - 0x101 for action when encountering end of input while in the
87 	 *         current state (often ACTION_ERROR)
88 	 */
89 	static const unsigned int st[][0x102] = {
90 		[STATE_ERROR] = {
91 			[0x100] = (STATE_ERROR | ACTION_ERROR),
92 			[0x101] = ACTION_ERROR
93 		},
94 		[STATE_BEGIN] = {
95 			[' '] = STATE_BEGIN,
96 			['\f'] = STATE_BEGIN,
97 			['\n'] = STATE_BEGIN,
98 			['\r'] = STATE_BEGIN,
99 			['\t'] = STATE_BEGIN,
100 			['\v'] = STATE_BEGIN,
101 			[';'] = (STATE_ERROR | ACTION_ERROR),
102 			['#'] = STATE_COMMENT,
103 			['\''] = STATE_KEYSQ,
104 			['"'] = STATE_KEYDQ,
105 			['\\'] = STATE_KEYBS,
106 			['='] = (STATE_ERROR | ACTION_ERROR),
107 			[0x100] = (STATE_KEY | ACTION_STORE),
108 			[0x101] = 0
109 		},
110 		[STATE_COMMENT] = {
111 			['\n'] = STATE_BEGIN,
112 			[0x100] = STATE_COMMENT,
113 			[0x101] = 0
114 		},
115 		[STATE_KEY] = {
116 			[' '] = (STATE_BEQ | ACTION_KEY),
117 			['\f'] = (STATE_BEQ | ACTION_KEY),
118 			['\n'] = (STATE_BEQ | ACTION_KEY),
119 			['\r'] = (STATE_BEQ | ACTION_KEY),
120 			['\t'] = (STATE_BEQ | ACTION_KEY),
121 			['\v'] = (STATE_BEQ | ACTION_KEY),
122 			['\''] = STATE_KEYSQ,
123 			['\"'] = STATE_KEYDQ,
124 			[';'] = (STATE_ERROR | ACTION_ERROR),
125 			['='] = (STATE_AEQ | ACTION_KEY),
126 			['#'] = (STATE_ERROR | ACTION_ERROR),
127 			['\\'] = STATE_KEYBS,
128 			[0x100] = (STATE_KEY | ACTION_STORE),
129 			[0x101] = ACTION_ERROR
130 		},
131 		[STATE_KEYBS] = {
132 			['f'] = (STATE_KEY | ACTION_STORE_MOD),
133 			['n'] = (STATE_KEY | ACTION_STORE_MOD),
134 			['r'] = (STATE_KEY | ACTION_STORE_MOD),
135 			['t'] = (STATE_KEY | ACTION_STORE_MOD),
136 			['v'] = (STATE_KEY | ACTION_STORE_MOD),
137 			['x'] = STATE_KEYBSX1,
138 			['\n'] = STATE_KEY,
139 			[0x100] = (STATE_KEY | ACTION_STORE),
140 			[0x101] = ACTION_ERROR
141 		},
142 		[STATE_KEYBSX1] = {
143 			HEX_INDICES(STATE_KEYBSX2 | ACTION_STORE_HEX1),
144 			[0x100] = (STATE_ERROR | ACTION_ERROR),
145 			[0x101] = ACTION_ERROR
146 		},
147 		[STATE_KEYBSX2] = {
148 			HEX_INDICES(STATE_KEY | ACTION_STORE_HEX2),
149 			[0x100] = (STATE_ERROR | ACTION_ERROR),
150 			[0x101] = ACTION_ERROR
151 		},
152 		[STATE_KEYSQ] = {
153 			['\''] = STATE_KEY,
154 			[0x100] = (STATE_KEYSQ | ACTION_STORE),
155 			[0x101] = ACTION_ERROR
156 		},
157 		[STATE_KEYDQ] = {
158 			['"'] = STATE_KEY,
159 			['\\'] = STATE_KEYDQBS,
160 			[0x100] = (STATE_KEYDQ | ACTION_STORE),
161 			[0x101] = ACTION_ERROR
162 		},
163 		[STATE_KEYDQBS] = {
164 			['f'] = (STATE_KEYDQ | ACTION_STORE_MOD),
165 			['n'] = (STATE_KEYDQ | ACTION_STORE_MOD),
166 			['r'] = (STATE_KEYDQ | ACTION_STORE_MOD),
167 			['t'] = (STATE_KEYDQ | ACTION_STORE_MOD),
168 			['v'] = (STATE_KEYDQ | ACTION_STORE_MOD),
169 			['x'] = STATE_KEYDQBSX1,
170 			['\n'] = STATE_KEYDQ,
171 			[0x100] = (STATE_KEYDQ | ACTION_STORE),
172 			[0x101] = ACTION_ERROR
173 		},
174 		[STATE_KEYDQBSX1] = {
175 			HEX_INDICES(STATE_KEYDQBSX2 | ACTION_STORE_HEX1),
176 			[0x100] = (STATE_ERROR | ACTION_ERROR),
177 			[0x101] = ACTION_ERROR
178 		},
179 		[STATE_KEYDQBSX2] = {
180 			HEX_INDICES(STATE_KEYDQ | ACTION_STORE_HEX2),
181 			[0x100] = (STATE_ERROR | ACTION_ERROR),
182 			[0x101] = ACTION_ERROR
183 		},
184 		[STATE_BEQ] = {
185 			[' '] = STATE_BEQ,
186 			['\f'] = STATE_BEQ,
187 			['\n'] = STATE_BEQ,
188 			['\r'] = STATE_BEQ,
189 			['\t'] = STATE_BEQ,
190 			['\v'] = STATE_BEQ,
191 			['='] = STATE_AEQ,
192 			[0x100] = (STATE_ERROR | ACTION_ERROR),
193 			[0x101] = ACTION_ERROR
194 		},
195 		[STATE_AEQ] = {
196 			[' '] = STATE_AEQ,
197 			['\f'] = STATE_AEQ,
198 			['\n'] = STATE_AEQ,
199 			['\r'] = STATE_AEQ,
200 			['\t'] = STATE_AEQ,
201 			['\v'] = STATE_AEQ,
202 			['\''] = STATE_VALSQ,
203 			['\"'] = STATE_VALDQ,
204 			['\\'] = STATE_VALBS,
205 			['='] = (STATE_ERROR | ACTION_ERROR),
206 			['#'] = (STATE_COMMENT | ACTION_VALUE),
207 			[';'] = (STATE_BEGIN | ACTION_VALUE),
208 			[0x100] = (STATE_VALUE | ACTION_STORE),
209 			[0x101] = ACTION_VALUE
210 		},
211 		[STATE_VALUE] = {
212 			[' '] = (STATE_VALEND | ACTION_VALUE),
213 			['\f'] = (STATE_VALEND | ACTION_VALUE),
214 			['\n'] = (STATE_BEGIN | ACTION_VALUE),
215 			['\r'] = (STATE_VALEND | ACTION_VALUE),
216 			['\t'] = (STATE_VALEND | ACTION_VALUE),
217 			['\v'] = (STATE_VALEND | ACTION_VALUE),
218 			['\''] = STATE_VALSQ,
219 			['\"'] = STATE_VALDQ,
220 			[';'] = (STATE_BEGIN | ACTION_VALUE),
221 			['='] = (STATE_ERROR | ACTION_ERROR),
222 			['#'] = (STATE_COMMENT | ACTION_VALUE),
223 			['\\'] = STATE_VALBS,
224 			[0x100] = (STATE_VALUE | ACTION_STORE),
225 			[0x101] = ACTION_VALUE
226 		},
227 		[STATE_VALBS] = {
228 			['f'] = (STATE_VALUE | ACTION_STORE_MOD),
229 			['n'] = (STATE_VALUE | ACTION_STORE_MOD),
230 			['r'] = (STATE_VALUE | ACTION_STORE_MOD),
231 			['t'] = (STATE_VALUE | ACTION_STORE_MOD),
232 			['v'] = (STATE_VALUE | ACTION_STORE_MOD),
233 			['x'] = STATE_VALBSX1,
234 			['\n'] = STATE_VALUE,
235 			[0x100] = (STATE_VALUE | ACTION_STORE),
236 			[0x101] = ACTION_ERROR
237 		},
238 		[STATE_VALBSX1] = {
239 			HEX_INDICES(STATE_VALBSX2 | ACTION_STORE_HEX1),
240 			[0x100] = (STATE_ERROR | ACTION_ERROR),
241 			[0x101] = ACTION_ERROR
242 		},
243 		[STATE_VALBSX2] = {
244 			HEX_INDICES(STATE_VALUE | ACTION_STORE_HEX2),
245 			[0x100] = (STATE_ERROR | ACTION_ERROR),
246 			[0x101] = ACTION_ERROR
247 		},
248 		[STATE_VALSQ] = {
249 			['\''] = STATE_VALUE,
250 			[0x100] = (STATE_VALSQ | ACTION_STORE),
251 			[0x101] = ACTION_ERROR
252 		},
253 		[STATE_VALDQ] = {
254 			['"'] = STATE_VALUE,
255 			['\\'] = STATE_VALDQBS,
256 			[0x100] = (STATE_VALDQ | ACTION_STORE),
257 			[0x101] = ACTION_ERROR
258 		},
259 		[STATE_VALDQBS] = {
260 			['f'] = (STATE_VALDQ | ACTION_STORE_MOD),
261 			['n'] = (STATE_VALDQ | ACTION_STORE_MOD),
262 			['r'] = (STATE_VALDQ | ACTION_STORE_MOD),
263 			['t'] = (STATE_VALDQ | ACTION_STORE_MOD),
264 			['v'] = (STATE_VALDQ | ACTION_STORE_MOD),
265 			['x'] = STATE_VALDQBSX1,
266 			['\n'] = STATE_VALDQ,
267 			[0x100] = (STATE_VALDQ | ACTION_STORE),
268 			[0x101] = ACTION_ERROR
269 		},
270 		[STATE_VALDQBSX1] = {
271 			HEX_INDICES(STATE_VALDQBSX2 | ACTION_STORE_HEX1),
272 			[0x100] = (STATE_ERROR | ACTION_ERROR),
273 			[0x101] = ACTION_ERROR
274 		},
275 		[STATE_VALDQBSX2] = {
276 			HEX_INDICES(STATE_VALDQ | ACTION_STORE_HEX2),
277 			[0x100] = (STATE_ERROR | ACTION_ERROR),
278 			[0x101] = ACTION_ERROR
279 		},
280 		[STATE_VALEND] = {
281 			[' '] = STATE_VALEND,
282 			['\f'] = STATE_VALEND,
283 			['\n'] = STATE_BEGIN,
284 			['\r'] = STATE_VALEND,
285 			['\t'] = STATE_VALEND,
286 			['\v'] = STATE_VALEND,
287 			[';'] = STATE_BEGIN,
288 			['#'] = STATE_COMMENT,
289 			[0x100] = (STATE_ERROR | ACTION_ERROR),
290 			[0x101] = 0
291 		}
292 	};
293 	static const unsigned char cv[] = {
294 		['f'] = '\f', ['n'] = '\n', ['r'] = '\r',
295 		['t'] = '\t', ['v'] = '\v'
296 	};
297 	static const unsigned char hb[] = {
298 		['0'] = 0x0, ['1'] = 0x1, ['2'] = 0x2, ['3'] = 0x3,
299 		['4'] = 0x4, ['5'] = 0x5, ['6'] = 0x6, ['7'] = 0x7,
300 		['8'] = 0x8, ['9'] = 0x9, ['a'] = 0xa, ['b'] = 0xb,
301 		['c'] = 0xc, ['d'] = 0xd, ['e'] = 0xe, ['f'] = 0xf,
302 		['A'] = 0xa, ['B'] = 0xb, ['C'] = 0xc, ['D'] = 0xd,
303 		['E'] = 0xe, ['F'] = 0xf
304 	};
305 	size_t i;
306 
307 	assert(sizeof(unsigned int) >= 4);
308 	assert(ckvp != NULL);
309 	assert(in != NULL);
310 	if (ckvp->state != CKVP_NONE) {
311 		ckvp->out_size = 0;
312 		ckvp->state = CKVP_NONE;
313 	}
314 	if (ckvp->internal & 0x00010000) {
315 		++(ckvp->line);
316 		ckvp->column = 1;
317 	}
318 	else if (ckvp->internal & 0x00020000)
319 		++(ckvp->column);
320 	ckvp->internal &= ~(0x00030000);
321 	if (size == 0) {
322 		assert((ckvp->internal & 0x00ff) != 0x00);
323 		assert((ckvp->internal & 0x00ff) <= STATE_VALEND);
324 		if (st[(ckvp->internal & 0x00ff)][0x101] & ACTION_ERROR)
325 			ckvp->state = CKVP_ERROR;
326 		else if (st[(ckvp->internal & 0x00ff)][0x101] & ACTION_VALUE)
327 			ckvp->state = CKVP_OUT_VALUE;
328 		return 0;
329 	}
330 	for (i = 0; (i < size); ++i) {
331 		unsigned char c = in[i];
332 		unsigned int newst;
333 
334 		assert((ckvp->internal & 0x00ff) != 0x00);
335 		assert((ckvp->internal & 0x00ff) <= STATE_VALEND);
336 		if ((newst = st[(ckvp->internal & 0x00ff)][(c & 0xff)]) == 0)
337 			newst = st[(ckvp->internal & 0x00ff)][0x100];
338 		ckvp->internal = ((ckvp->internal & 0xffff0000) | newst);
339 		assert(newst != 0);
340 		if (newst & 0x0f00) {
341 			if (newst & ACTION_ERROR)
342 				ckvp->state = CKVP_ERROR;
343 			else if (newst & ACTION_KEY)
344 				ckvp->state = CKVP_OUT_KEY;
345 			else if (newst & ACTION_VALUE)
346 				ckvp->state = CKVP_OUT_VALUE;
347 			goto endnl;
348 		}
349 		if (newst & 0xf000) {
350 			if (newst & ACTION_STORE_HEX1) {
351 				ckvp->internal &= ~(0x00f00000);
352 				ckvp->internal |= (hb[c] << 20);
353 				continue;
354 			}
355 			else if (newst & ACTION_STORE_HEX2)
356 				c = (((ckvp->internal >> 16) & 0xf0) | hb[c]);
357 			else if (newst & ACTION_STORE_MOD)
358 				c = cv[c];
359 			if (ckvp->out_size == CKVP_OUT_SIZE) {
360 				ckvp->out[0] = c;
361 				ckvp->out_size = 1;
362 			}
363 			else
364 				ckvp->out[((ckvp->out_size)++)] = c;
365 			if (ckvp->out_size == CKVP_OUT_SIZE) {
366 				ckvp->state = CKVP_OUT_FULL;
367 				goto endnl;
368 			}
369 		}
370 		if (c == '\n') {
371 			++(ckvp->line);
372 			ckvp->column = 1;
373 		}
374 		else
375 			++(ckvp->column);
376 		continue;
377 	endnl:
378 		if (c == '\n')
379 			ckvp->internal |= 0x00010000;
380 		else
381 			ckvp->internal |= 0x00020000;
382 		return ++i;
383 	}
384 	return size;
385 }
386