1 // Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
2 //
3 // This program is free software; you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License, version 2.0, as
5 // published by the Free Software Foundation.
6 //
7 // This program is also distributed with certain software (including
8 // but not limited to OpenSSL) that is licensed under separate terms,
9 // as designated in a particular file or component or in included license
10 // documentation. The authors of MySQL hereby grant you an
11 // additional permission to link the program and your derivative works
12 // with the separately licensed software that they have included with
13 // MySQL.
14 //
15 // Without limiting anything contained in the foregoing, this file,
16 // which is part of <MySQL Product>, is also subject to the
17 // Universal FOSS Exception, version 1.0, a copy of which can be found at
18 // http://oss.oracle.com/licenses/universal-foss-exception.
19 //
20 // This program is distributed in the hope that it will be useful, but
21 // WITHOUT ANY WARRANTY; without even the implied warranty of
22 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
23 // See the GNU General Public License, version 2.0, for more details.
24 //
25 // You should have received a copy of the GNU General Public License
26 // along with this program; if not, write to the Free Software Foundation, Inc.,
27 // 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28
29 #include "my_global.h"
30 #include "m_string.h"
31 #include "my_xml.h"
32
33
/*
  Token codes returned by the scanner.
  Punctuation tokens are represented by the punctuation character itself;
  the remaining tokens use distinct capital letters.
*/
#define MY_XML_UNKNOWN  'U'   /* unrecognized input                  */
#define MY_XML_EOF      'E'   /* end of input                        */
#define MY_XML_STRING   'S'   /* quoted string                       */
#define MY_XML_IDENT    'I'   /* identifier                          */
#define MY_XML_EQ       '='
#define MY_XML_LT       '<'
#define MY_XML_GT       '>'
#define MY_XML_SLASH    '/'
#define MY_XML_COMMENT  'C'   /* <!-- ... -->                        */
#define MY_XML_TEXT     'T'   /* character data                      */
#define MY_XML_QUESTION '?'
#define MY_XML_EXCLAM   '!'
#define MY_XML_CDATA    'D'   /* <![CDATA[ ... ]]>                   */
47
/* A span of input characters delimited by two pointers: [beg, end). */
typedef struct xml_attr_st
{
  const char *beg;   /* first character of the span         */
  const char *end;   /* one past the last character of span */
} MY_XML_ATTR;
53
54
/*
  XML ctype: bit flags stored per byte value in the character class table.
*/
#define MY_XML_ID0  0x01   /* Identifier initial character */
#define MY_XML_ID1  0x02   /* Identifier medial character  */
#define MY_XML_SPC  0x08   /* Spacing character            */
61
62
63 /*
64 http://www.w3.org/TR/REC-xml/
65 [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
66 CombiningChar | Extender
67 [5] Name ::= (Letter | '_' | ':') (NameChar)*
68 */
69
/*
  Character class table indexed by unsigned byte value.
  Each entry is a combination of the XML ctype flags:
    1 - may start an identifier, 2 - may continue an identifier,
    8 - white space.
  The table is only ever read, hence it is declared const.
*/
static const char my_xml_ctype[256]=
{
  /*00*/ 0,0,0,0,0,0,0,0,0,8,8,0,0,8,0,0,
  /*10*/ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /*20*/ 8,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0, /*  !"#$%&'()*+,-./ */
  /*30*/ 2,2,2,2,2,2,2,2,2,2,3,0,0,0,0,0, /* 0123456789:;<=>? */
  /*40*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* @ABCDEFGHIJKLMNO */
  /*50*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,3, /* PQRSTUVWXYZ[\]^_ */
  /*60*/ 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* `abcdefghijklmno */
  /*70*/ 3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0, /* pqrstuvwxyz{|}~  */
  /*80*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  /*90*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  /*A0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  /*B0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  /*C0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  /*D0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  /*E0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  /*F0*/ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
};
89
/*
  Character class predicates. The argument is converted to uchar before
  it is used as a table index; the result is non-zero when the requested
  flag is set for that byte.
*/
#define my_xml_ctype_test(c, mask) (my_xml_ctype[(uchar) (c)] & (mask))
#define my_xml_is_space(c)         my_xml_ctype_test(c, MY_XML_SPC)
#define my_xml_is_id0(c)           my_xml_ctype_test(c, MY_XML_ID0)
#define my_xml_is_id1(c)           my_xml_ctype_test(c, MY_XML_ID1)
93
94
lex2str(int lex)95 static const char *lex2str(int lex)
96 {
97 switch(lex)
98 {
99 case MY_XML_EOF: return "END-OF-INPUT";
100 case MY_XML_STRING: return "STRING";
101 case MY_XML_IDENT: return "IDENT";
102 case MY_XML_CDATA: return "CDATA";
103 case MY_XML_EQ: return "'='";
104 case MY_XML_LT: return "'<'";
105 case MY_XML_GT: return "'>'";
106 case MY_XML_SLASH: return "'/'";
107 case MY_XML_COMMENT: return "COMMENT";
108 case MY_XML_TEXT: return "TEXT";
109 case MY_XML_QUESTION: return "'?'";
110 case MY_XML_EXCLAM: return "'!'";
111 }
112 return "unknown token";
113 }
114
/**
  Trim white space from both ends of a span, in place.

  @param a  span to shrink; a->beg is advanced and a->end is moved back
            while they point at spacing characters
*/
static void my_xml_norm_text(MY_XML_ATTR *a)
{
  /* Leading white space */
  while ((a->beg < a->end) && my_xml_is_space(a->beg[0]))
    a->beg++;

  /* Trailing white space */
  while ((a->beg < a->end) && my_xml_is_space(a->end[-1]))
    a->end--;
}
120
121
122 static inline my_bool
my_xml_parser_prefix_cmp(MY_XML_PARSER * p,const char * s,size_t slen)123 my_xml_parser_prefix_cmp(MY_XML_PARSER *p, const char *s, size_t slen)
124 {
125 return (p->cur + slen > p->end) || memcmp(p->cur, s, slen);
126 }
127
128
/**
  Read the next token from the input.

  Leading white space is skipped, then one token is consumed and
  p->cur is left pointing just after it.

  @param p  the parser instance; p->cur is advanced
  @param a  receives the token span:
            - COMMENT: the whole comment including its delimiters
            - CDATA:   the whole section including "<![CDATA[" and "]]>"
            - STRING:  the text between the quotes (trimmed unless
                       MY_XML_FLAG_SKIP_TEXT_NORMALIZATION is set)
            - IDENT and punctuation: the token itself
            - EOF:     an empty span at p->end
  @return   the token code. MY_XML_UNKNOWN is returned for input that does
            not start any token (p->cur is not advanced in that case) and
            for a CDATA section that is not closed by "]]>" (the rest of
            the input is consumed in that case).
*/
static int my_xml_scan(MY_XML_PARSER *p, MY_XML_ATTR *a)
{
  for (; (p->cur < p->end) && my_xml_is_space(p->cur[0]); p->cur++);

  if (p->cur >= p->end)
  {
    a->beg= p->end;
    a->end= p->end;
    return MY_XML_EOF;
  }

  a->beg= p->cur;
  a->end= p->cur;

  if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<!--")))
  {
    /* Comment: runs up to and including "-->", or to the end of input */
    for (; p->cur < p->end; p->cur++)
    {
      if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("-->")))
      {
        p->cur+= 3;
        break;
      }
    }
    a->end= p->cur;
    return MY_XML_COMMENT;
  }

  if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<![CDATA[")))
  {
    p->cur+= 9;
    for (; (p->end - p->cur) >= 3; p->cur++)
    {
      if (p->cur[0] == ']' && p->cur[1] == ']' && p->cur[2] == '>')
      {
        p->cur+= 3;
        a->end= p->cur;
        return MY_XML_CDATA;
      }
    }
    /*
      No closing "]]>". Do not report a CDATA token: the caller strips the
      delimiters from the span and would end up with a negative length.
      Consume the rest of the input so that the caller stops on EOF.
    */
    p->cur= p->end;
    a->end= p->end;
    return MY_XML_UNKNOWN;
  }

  /*
    Punctuation. strchr() also matches the terminating zero of the set,
    so a zero byte in the input has to be excluded explicitly.
  */
  if ((p->cur[0] != '\0') && strchr("?=/<>!", p->cur[0]))
  {
    p->cur++;
    a->end= p->cur;
    return a->beg[0];
  }

  if ((p->cur[0] == '"') || (p->cur[0] == '\''))
  {
    /*
      "string" or 'string' found.
      Scan until the closing quote/doublequote, or until the END-OF-INPUT.
    */
    const char quote= p->cur[0];

    p->cur++;
    for (; (p->cur < p->end) && (p->cur[0] != quote); p->cur++)
    {}
    a->end= p->cur;
    if (p->cur < p->end) /* Closing quote or doublequote has been found */
      p->cur++;
    a->beg++;            /* Skip the opening quote */
    if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
      my_xml_norm_text(a);
    return MY_XML_STRING;
  }

  if (my_xml_is_id0(p->cur[0]))
  {
    p->cur++;
    while ((p->cur < p->end) && my_xml_is_id1(p->cur[0]))
      p->cur++;
    a->end= p->cur;
    my_xml_norm_text(a);
    return MY_XML_IDENT;
  }

  return MY_XML_UNKNOWN;
}
215
216
/**
  Pass a value to the user supplied value handler, if there is one.

  @param st   the parser instance
  @param str  start of the value
  @param len  length of the value
  @return     the handler's result, or MY_XML_OK when no handler is set
*/
static int my_xml_value(MY_XML_PARSER *st, const char *str, size_t len)
{
  if (!st->value)
    return MY_XML_OK;
  return (st->value)(st, str, len);
}
221
222
223 /**
224 Ensure the attr buffer is wide enough to hold the new value
225
226 Expand and/or allocate dynamic buffer as needed to hold the concatenated
227 path and the terminating zero.
228
229 @attr st the parser instance
230 @attr len the length of the attribute to be added
231 @return state
232 @retval 1 failed
233 @retval 0 success
234 */
my_xml_attr_ensure_space(MY_XML_PARSER * st,size_t len)235 static int my_xml_attr_ensure_space(MY_XML_PARSER *st, size_t len)
236 {
237 size_t ofs= st->attr.end - st->attr.start;
238 len++; // Add terminating zero.
239 if (ofs + len > st->attr.buffer_size)
240 {
241 st->attr.buffer_size= (SIZE_T_MAX - len) / 2 > st->attr.buffer_size ?
242 st->attr.buffer_size * 2 + len : SIZE_T_MAX;
243
244 if (!st->attr.buffer)
245 {
246 st->attr.buffer= (char *) my_str_malloc(st->attr.buffer_size);
247 if (st->attr.buffer)
248 memcpy(st->attr.buffer, st->attr.static_buffer, ofs + 1 /*term. zero */);
249 }
250 else
251 st->attr.buffer= (char *) my_str_realloc(st->attr.buffer,
252 st->attr.buffer_size);
253 st->attr.start= st->attr.buffer;
254 st->attr.end= st->attr.start + ofs;
255
256 return st->attr.buffer ? MY_XML_OK : MY_XML_ERROR;
257 }
258 return MY_XML_OK;
259 }
260
261
262 /** rewind the attr buffer to initial state */
my_xml_attr_rewind(MY_XML_PARSER * p)263 static void my_xml_attr_rewind(MY_XML_PARSER *p)
264 {
265 /* keep the buffer already allocated */
266 p->attr.end= p->attr.start;
267 }
268
269
/**
  Append a name to the current path and notify the enter handler.

  The name is added to the attr buffer, separated from the previous
  component by '/'. The handler receives either the name alone (when
  MY_XML_FLAG_RELATIVE_NAMES is set) or the whole path.

  @param st   the parser instance
  @param str  start of the name
  @param len  length of the name
  @return     MY_XML_ERROR if the buffer could not be grown, otherwise the
              handler's result, or MY_XML_OK when no handler is set
*/
static int my_xml_enter(MY_XML_PARSER *st, const char *str, size_t len)
{
  if (my_xml_attr_ensure_space(st, len + 1 /* the separator char */))
    return MY_XML_ERROR;

  /* No separator before the first path component */
  if (st->attr.end > st->attr.start)
    *st->attr.end++= '/';

  memcpy(st->attr.end, str, len);
  st->attr.end+= len;
  *st->attr.end= '\0';

  if (!st->enter)
    return MY_XML_OK;

  if (st->flags & MY_XML_FLAG_RELATIVE_NAMES)
    return st->enter(st, str, len);

  return st->enter(st, st->attr.start, st->attr.end - st->attr.start);
}
289
290
/**
  Copy at most min(l1, l2) bytes and add a terminating zero.

  @param s    destination; must have room for min(l1, l2) + 1 bytes
  @param src  source bytes
  @param l1   first length limit
  @param l2   second length limit
*/
static void mstr(char *s, const char *src, size_t l1, size_t l2)
{
  size_t n= l2;

  if (l1 < l2)
    n= l1;
  memcpy(s, src, n);
  s[n]= '\0';
}
297
298
/**
  Close the innermost open node: notify the leave handler and remove the
  last component from the current path.

  @param p     the parser instance
  @param str   name found in the closing tag, or NULL to close the
               innermost node without checking its name
  @param slen  length of str
  @return      MY_XML_ERROR (with p->errstr set) when str is given and its
               length differs from that of the innermost open node;
               otherwise the handler's result, or MY_XML_OK when no
               handler is set
*/
static int my_xml_leave(MY_XML_PARSER *p, const char *str, size_t slen)
{
  char *e;
  const char *name;
  size_t glen;
  char s[32];
  char g[32];
  int rc;

  /* Find previous '/' or beginning */
  for (e= p->attr.end; (e > p->attr.start) && (e[0] != '/'); e--);

  /*
    The last component starts after the '/', or at the very beginning of
    the path when there is no '/' (a top level node).
  */
  name= (e[0] == '/') ? e + 1 : e;
  glen= (size_t) (p->attr.end - name);

  /*
    NOTE(review): only the lengths are compared, so a closing tag with a
    different name of the same length is accepted. Left as is because
    tightening it would reject input that is accepted today.
  */
  if (str && (slen != glen))
  {
    mstr(s, str, sizeof(s) - 1, slen);
    if (glen)
    {
      mstr(g, name, sizeof(g) - 1, glen);
      sprintf(p->errstr, "'</%s>' unexpected ('</%s>' wanted)", s, g);
    }
    else
      sprintf(p->errstr, "'</%s>' unexpected (END-OF-INPUT wanted)", s);
    return MY_XML_ERROR;
  }

  if (p->flags & MY_XML_FLAG_RELATIVE_NAMES)
    rc= p->leave_xml ? p->leave_xml(p, str, slen) : MY_XML_OK;
  else
    rc= (p->leave_xml ?
         p->leave_xml(p, p->attr.start, p->attr.end - p->attr.start) :
         MY_XML_OK);

  /* Cut the last component (and its separator) off the path */
  *e= '\0';
  p->attr.end= e;

  return rc;
}
336
337
/**
  Parse an XML text, invoking the enter, value and leave handlers.

  Tags and attributes are reported through my_xml_enter()/my_xml_leave(),
  attribute values, CDATA content and text through my_xml_value().
  Comments are skipped. The results of the value handler for CDATA and
  text are ignored; all other handler failures stop the parse.

  @param p    the parser instance
  @param str  the text to parse
  @param len  length of the text in bytes
  @return     MY_XML_OK on success, MY_XML_ERROR otherwise. On a syntax
              error a message is written to p->errstr and p->cur marks
              the position where parsing stopped.
*/
int my_xml_parse(MY_XML_PARSER *p, const char *str, size_t len)
{

  my_xml_attr_rewind(p);

  p->beg= str;
  p->cur= str;
  p->end= str + len;

  while (p->cur < p->end)
  {
    MY_XML_ATTR a;
    if (p->cur[0] == '<')
    {
      int lex;
      int question= 0;
      int exclam= 0;

      lex= my_xml_scan(p, &a);

      if (MY_XML_COMMENT == lex)
        continue;

      if (lex == MY_XML_CDATA)
      {
        /*
          The span must hold at least "<![CDATA[" (9) and "]]>" (3).
          A shorter span means the section was never closed; stripping
          the delimiters would then give a negative length.
        */
        if ((a.end - a.beg) < 9 + 3)
        {
          p->cur= p->end;
          sprintf(p->errstr, "%s unexpected (']]>' wanted)",
                  lex2str(MY_XML_EOF));
          return MY_XML_ERROR;
        }
        a.beg+= 9;
        a.end-= 3;
        my_xml_value(p, a.beg, (size_t) (a.end - a.beg));
        continue;
      }

      /* The first scan consumed '<'; now look at what follows it */
      lex= my_xml_scan(p, &a);

      if (MY_XML_SLASH == lex)
      {
        /* Closing tag: </ident> */
        if (MY_XML_IDENT != (lex= my_xml_scan(p, &a)))
        {
          sprintf(p->errstr, "%s unexpected (ident wanted)", lex2str(lex));
          return MY_XML_ERROR;
        }
        if (MY_XML_OK != my_xml_leave(p, a.beg, (size_t) (a.end - a.beg)))
          return MY_XML_ERROR;
        lex= my_xml_scan(p, &a);
        goto gt;
      }

      if (MY_XML_EXCLAM == lex)
      {
        lex= my_xml_scan(p, &a);
        exclam= 1;
      }
      else if (MY_XML_QUESTION == lex)
      {
        lex= my_xml_scan(p, &a);
        question= 1;
      }

      if (MY_XML_IDENT == lex)
      {
        p->current_node_type= MY_XML_NODE_TAG;
        if (MY_XML_OK != my_xml_enter(p, a.beg, (size_t) (a.end - a.beg)))
          return MY_XML_ERROR;
      }
      else
      {
        sprintf(p->errstr, "%s unexpected (ident or '/' wanted)",
                lex2str(lex));
        return MY_XML_ERROR;
      }

      /* Attributes: name=value, or a bare name */
      while ((MY_XML_IDENT == (lex= my_xml_scan(p, &a))) ||
             ((MY_XML_STRING == lex && exclam)))
      {
        MY_XML_ATTR b;
        if (MY_XML_EQ == (lex= my_xml_scan(p, &b)))
        {
          lex= my_xml_scan(p, &b);
          if ((lex == MY_XML_IDENT) || (lex == MY_XML_STRING))
          {
            p->current_node_type= MY_XML_NODE_ATTR;
            if ((MY_XML_OK != my_xml_enter(p, a.beg,
                                           (size_t) (a.end - a.beg))) ||
                (MY_XML_OK != my_xml_value(p, b.beg,
                                           (size_t) (b.end - b.beg))) ||
                (MY_XML_OK != my_xml_leave(p, a.beg,
                                           (size_t) (a.end - a.beg))))
              return MY_XML_ERROR;
          }
          else
          {
            sprintf(p->errstr, "%s unexpected (ident or string wanted)",
                    lex2str(lex));
            return MY_XML_ERROR;
          }
        }
        else if (MY_XML_IDENT == lex)
        {
          p->current_node_type= MY_XML_NODE_ATTR;
          if ((MY_XML_OK != my_xml_enter(p, a.beg,
                                         (size_t) (a.end - a.beg))) ||
              (MY_XML_OK != my_xml_leave(p, a.beg,
                                         (size_t) (a.end - a.beg))))
            return MY_XML_ERROR;
        }
        else if ((MY_XML_STRING == lex) && exclam)
        {
          /*
            We are in <!DOCTYPE>, e.g.
            <!DOCTYPE name SYSTEM "SystemLiteral">
            <!DOCTYPE name PUBLIC "PubidLiteral" "SystemLiteral">
            Just skip "SystemLiteral" and "PubidLiteral"
          */
        }
        else
          break;
      }

      if (lex == MY_XML_SLASH)
      {
        /* Self-closing tag: <ident ... /> */
        if (MY_XML_OK != my_xml_leave(p, NULL, 0))
          return MY_XML_ERROR;
        lex= my_xml_scan(p, &a);
      }

gt:
      if (question)
      {
        if (lex != MY_XML_QUESTION)
        {
          sprintf(p->errstr, "%s unexpected ('?' wanted)", lex2str(lex));
          return MY_XML_ERROR;
        }
        if (MY_XML_OK != my_xml_leave(p, NULL, 0))
          return MY_XML_ERROR;
        lex= my_xml_scan(p, &a);
      }

      if (exclam)
      {
        if (MY_XML_OK != my_xml_leave(p, NULL, 0))
          return MY_XML_ERROR;
      }

      if (lex != MY_XML_GT)
      {
        sprintf(p->errstr, "%s unexpected ('>' wanted)", lex2str(lex));
        return MY_XML_ERROR;
      }
    }
    else
    {
      /* Text: everything up to the next '<' or the end of input */
      a.beg= p->cur;
      for (; (p->cur < p->end) && (p->cur[0] != '<'); p->cur++);
      a.end= p->cur;

      if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
        my_xml_norm_text(&a);
      if (a.beg != a.end)
      {
        my_xml_value(p, a.beg, (size_t) (a.end - a.beg));
      }
    }
  }

  /* A non-empty path means that some element has not been closed */
  if (p->attr.start[0])
  {
    sprintf(p->errstr, "unexpected END-OF-INPUT");
    return MY_XML_ERROR;
  }
  return MY_XML_OK;
}
504
505
/**
  Initialize a parser instance.

  All members are zeroed and the path buffer is pointed at the static
  buffer embedded in the parser.

  @param p  the parser instance to initialize
*/
void my_xml_parser_create(MY_XML_PARSER *p)
{
  memset(p, 0, sizeof(*p));

  /*
    Use static buffer while it's sufficient.
  */
  p->attr.buffer_size= sizeof(p->attr.static_buffer);
  p->attr.start= p->attr.static_buffer;
  p->attr.end= p->attr.static_buffer;
}
515
516
/**
  Release the dynamically allocated path buffer, if there is one.

  @param p  the parser instance
*/
void my_xml_parser_free(MY_XML_PARSER *p)
{
  if (!p->attr.buffer)
    return;

  my_str_free(p->attr.buffer);
  p->attr.buffer= NULL;
}
525
526
/**
  Set the handler that receives attribute values, CDATA content and text.

  @param p       the parser instance
  @param action  the handler; it is given the parser, the start of the
                 value and its length
*/
void my_xml_set_value_handler(MY_XML_PARSER *p,
                              int (*action)(MY_XML_PARSER *p,
                                            const char *s,
                                            size_t l))
{
  p->value= action;
}
533
/**
  Set the handler that is called when a tag or an attribute is entered.

  @param p       the parser instance
  @param action  the handler; it is given the parser, the start of the
                 name or path and its length
*/
void my_xml_set_enter_handler(MY_XML_PARSER *p,
                              int (*action)(MY_XML_PARSER *p,
                                            const char *s,
                                            size_t l))
{
  p->enter= action;
}
540
541
/**
  Set the handler that is called when a tag or an attribute is left.

  @param p       the parser instance
  @param action  the handler; it is given the parser, the start of the
                 name or path and its length
*/
void my_xml_set_leave_handler(MY_XML_PARSER *p,
                              int (*action)(MY_XML_PARSER *p,
                                            const char *s,
                                            size_t l))
{
  p->leave_xml= action;
}
548
549
/**
  Attach a user pointer to the parser.

  @param p          the parser instance
  @param user_data  the pointer to store in p->user_data
*/
void my_xml_set_user_data(MY_XML_PARSER *p, void *user_data)
{
  p->user_data= user_data;
}
554
555
/**
  Get the parser's error message buffer.

  @param p  the parser instance
  @return   p->errstr
*/
const char *my_xml_error_string(MY_XML_PARSER *p)
{
  const char *message= p->errstr;
  return message;
}
560
561
/**
  Get the position of the current scan point within its line.

  @param p  the parser instance
  @return   the distance from the closest newline before p->cur to p->cur,
            or the distance from the beginning of the input when there is
            no newline before p->cur
*/
size_t my_xml_error_pos(MY_XML_PARSER *p)
{
  const char *beg= p->beg;
  const char *s;

  /*
    Walk backwards looking at the character before s, so that no pointer
    below p->beg is ever formed.
  */
  for (s= p->cur; s > p->beg; s--)
  {
    if (s[-1] == '\n')
    {
      beg= s - 1;
      break;
    }
  }
  return (size_t) (p->cur - beg);
}
576
my_xml_error_lineno(MY_XML_PARSER * p)577 uint my_xml_error_lineno(MY_XML_PARSER *p)
578 {
579 uint res=0;
580 const char *s;
581 for (s=p->beg ; s<p->cur; s++)
582 {
583 if (s[0] == '\n')
584 res++;
585 }
586 return res;
587 }
588