1 // Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
2 //
3 // This program is free software; you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License, version 2.0, as
5 // published by the Free Software Foundation.
6 //
7 // This program is also distributed with certain software (including
8 // but not limited to OpenSSL) that is licensed under separate terms,
9 // as designated in a particular file or component or in included license
10 // documentation. The authors of MySQL hereby grant you an
11 // additional permission to link the program and your derivative works
12 // with the separately licensed software that they have included with
13 // MySQL.
14 //
15 // Without limiting anything contained in the foregoing, this file,
16 // which is part of <MySQL Product>, is also subject to the
17 // Universal FOSS Exception, version 1.0, a copy of which can be found at
18 // http://oss.oracle.com/licenses/universal-foss-exception.
19 //
20 // This program is distributed in the hope that it will be useful, but
21 // WITHOUT ANY WARRANTY; without even the implied warranty of
22 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
23 // See the GNU General Public License, version 2.0, for more details.
24 //
25 // You should have received a copy of the GNU General Public License
26 // along with this program; if not, write to the Free Software Foundation, Inc.,
27 // 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 
29 #include "my_global.h"
30 #include "m_string.h"
31 #include "my_xml.h"
32 
33 
/*
  Token codes returned by my_xml_scan().
  Punctuation tokens use the punctuation character itself as their code,
  the remaining tokens use a mnemonic letter.
*/
enum
{
  MY_XML_UNKNOWN=  'U',  /* unrecognised input */
  MY_XML_EOF=      'E',  /* end of input */
  MY_XML_STRING=   'S',  /* quoted string */
  MY_XML_IDENT=    'I',  /* identifier */
  MY_XML_EQ=       '=',
  MY_XML_LT=       '<',
  MY_XML_GT=       '>',
  MY_XML_SLASH=    '/',
  MY_XML_COMMENT=  'C',  /* <!-- ... --> */
  MY_XML_TEXT=     'T',
  MY_XML_QUESTION= '?',
  MY_XML_EXCLAM=   '!',
  MY_XML_CDATA=    'D'   /* <![CDATA[ ... ]]> */
};
47 
/*
  A span of input bytes: one scanned token or one run of text.
  The span is half-open, i.e. its length is (end - beg).
*/
typedef struct xml_attr_st
{
  const char *beg;  /* first byte of the span */
  const char *end;  /* one past the last byte of the span */
} MY_XML_ATTR;
53 
54 
/*
  XML ctype:
  bit flags stored per byte value in my_xml_ctype[].
  A table entry of 3 means MY_XML_ID0 | MY_XML_ID1.
*/
#define	MY_XML_ID0  0x01 /* Identifier initial character */
#define	MY_XML_ID1  0x02 /* Identifier medial  character */
#define	MY_XML_SPC  0x08 /* Spacing character */
61 
62 
/*
 http://www.w3.org/TR/REC-xml/
 [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
                  CombiningChar | Extender
 [5] Name ::= (Letter | '_' | ':') (NameChar)*
*/

/*
  Character classification table, indexed by unsigned byte value.
  Each entry is a combination of the XML ctype flags:
    8 - spacing character (TAB, LF, CR, SPACE)
    2 - identifier medial character only ('-', '.', digits)
    3 - identifier initial and medial character
  All bytes in the range 0x80..0xFF are marked as identifier characters.
  The table is only ever read, hence const.
*/
static const char my_xml_ctype[256]=
{
/*00*/  0,0,0,0,0,0,0,0,0,8,8,0,0,8,0,0,
/*10*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/*20*/  8,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,  /*  !"#$%&'()*+,-./ */
/*30*/  2,2,2,2,2,2,2,2,2,2,3,0,0,0,0,0,  /* 0123456789:;<=>? */
/*40*/  0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,  /* @ABCDEFGHIJKLMNO */
/*50*/  3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,3,  /* PQRSTUVWXYZ[\]^_ */
/*60*/  0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,  /* `abcdefghijklmno */
/*70*/  3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,  /* pqrstuvwxyz{|}~  */
/*80*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*90*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*A0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*B0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*C0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*D0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*E0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*F0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
};
89 
90 #define my_xml_is_space(c)  (my_xml_ctype[(uchar) (c)] & MY_XML_SPC)
91 #define my_xml_is_id0(c)    (my_xml_ctype[(uchar) (c)] & MY_XML_ID0)
92 #define my_xml_is_id1(c)    (my_xml_ctype[(uchar) (c)] & MY_XML_ID1)
93 
94 
lex2str(int lex)95 static const char *lex2str(int lex)
96 {
97   switch(lex)
98   {
99     case MY_XML_EOF:      return "END-OF-INPUT";
100     case MY_XML_STRING:   return "STRING";
101     case MY_XML_IDENT:    return "IDENT";
102     case MY_XML_CDATA:    return "CDATA";
103     case MY_XML_EQ:       return "'='";
104     case MY_XML_LT:       return "'<'";
105     case MY_XML_GT:       return "'>'";
106     case MY_XML_SLASH:    return "'/'";
107     case MY_XML_COMMENT:  return "COMMENT";
108     case MY_XML_TEXT:     return "TEXT";
109     case MY_XML_QUESTION: return "'?'";
110     case MY_XML_EXCLAM:   return "'!'";
111   }
112   return "unknown token";
113 }
114 
/**
  Trim spacing characters from both ends of a span, in place.

  @param a  span to shrink; beg is moved forward and end backward until
            neither edge is a spacing character or the span is empty
*/
static void my_xml_norm_text(MY_XML_ATTR *a)
{
  /* Strip leading spaces */
  while (a->beg < a->end && my_xml_is_space(*a->beg))
    a->beg++;

  /* Strip trailing spaces */
  while (a->beg < a->end && my_xml_is_space(*(a->end - 1)))
    a->end--;
}
120 
121 
122 static inline my_bool
my_xml_parser_prefix_cmp(MY_XML_PARSER * p,const char * s,size_t slen)123 my_xml_parser_prefix_cmp(MY_XML_PARSER *p, const char *s, size_t slen)
124 {
125   return (p->cur + slen > p->end) || memcmp(p->cur, s, slen);
126 }
127 
128 
/**
  Read the next token from the input.

  Leading spacing characters are skipped, then one token is consumed and
  p->cur is left just behind it:
    - COMMENT: "<!--" up to and including "-->", or up to end of input;
    - CDATA:   "<![CDATA[" up to and including "]]>"; the span covers the
               whole section including both delimiters;
    - one of the characters ?=/<>! : the character itself is the token code;
    - STRING:  quoted text; the span excludes the quotes and is trimmed
               unless MY_XML_FLAG_SKIP_TEXT_NORMALIZATION is set;
    - IDENT:   an identifier initial character followed by medial ones.
  Anything else gives MY_XML_UNKNOWN and does not move p->cur.

  An unterminated CDATA section is reported as MY_XML_EOF with the cursor
  moved to the end of input. It must not be returned as MY_XML_CDATA,
  because the caller strips the 9 + 3 delimiter bytes from the span and
  would compute a negative length.

  @param p  parser state
  @param a  receives the span of the token
  @return token code (one of the MY_XML_ token codes)
*/
static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a)
{
  int lex;

  for (; ( p->cur < p->end) && my_xml_is_space(p->cur[0]) ;  p->cur++);

  if (p->cur >= p->end)
  {
    a->beg=p->end;
    a->end=p->end;
    return MY_XML_EOF;
  }

  a->beg=p->cur;
  a->end=p->cur;

  if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<!--")))
  {
    for (; p->cur < p->end; p->cur++)
    {
      if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("-->")))
      {
        p->cur+= 3;
        break;
      }
    }
    a->end=p->cur;
    lex=MY_XML_COMMENT;
  }
  else if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<![CDATA[")))
  {
    int terminated= 0;

    p->cur+= 9;
    /* At least 3 bytes must be left for "]]>" to fit */
    for (; (p->end - p->cur) >= 3 ; p->cur++)
    {
      if (p->cur[0] == ']' && p->cur[1] == ']' && p->cur[2] == '>')
      {
        p->cur+= 3;
        a->end= p->cur;
        terminated= 1;
        break;
      }
    }

    if (terminated)
      lex= MY_XML_CDATA;
    else
    {
      /* No closing "]]>": the input ended inside the section */
      p->cur= p->end;
      a->beg= p->end;
      a->end= p->end;
      lex= MY_XML_EOF;
    }
  }
  else if (p->cur[0] != '\0' && strchr("?=/<>!",p->cur[0]))
  {
    /*
      The explicit NUL test is needed because strchr() also matches
      the terminating zero of the pattern string.
    */
    p->cur++;
    a->end=p->cur;
    lex=a->beg[0];
  }
  else if ( (p->cur[0] == '"') || (p->cur[0] == '\'') )
  {
    /*
      "string" or 'string' found.
      Scan until the closing quote/doublequote, or until the END-OF-INPUT.
    */
    p->cur++;
    for (; ( p->cur < p->end ) && (p->cur[0] != a->beg[0]); p->cur++)
    {}
    a->end=p->cur;
    if (p->cur < p->end) /* Closing quote or doublequote has been found */
      p->cur++;
    a->beg++;                 /* Skip the opening quote */
    if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
      my_xml_norm_text(a);
    lex=MY_XML_STRING;
  }
  else if (my_xml_is_id0(p->cur[0]))
  {
    p->cur++;
    while (p->cur < p->end && my_xml_is_id1(p->cur[0]))
      p->cur++;
    a->end=p->cur;
    my_xml_norm_text(a);
    lex=MY_XML_IDENT;
  }
  else
    lex= MY_XML_UNKNOWN;

  return lex;
}
215 
216 
/**
  Report a value to the user's value handler, if one is installed.

  @param st   parser state
  @param str  start of the value
  @param len  length of the value in bytes
  @return the handler's return code, or MY_XML_OK when no handler is set
*/
static int my_xml_value(MY_XML_PARSER *st, const char *str, size_t len)
{
  if (!st->value)
    return MY_XML_OK;
  return st->value(st, str, len);
}
221 
222 
223 /**
224   Ensure the attr buffer is wide enough to hold the new value
225 
226   Expand and/or allocate dynamic buffer as needed to hold the concatenated
227   path and the terminating zero.
228 
229   @attr st   the parser instance
230   @attr len  the length of the attribute to be added
231   @return state
232   @retval 1  failed
233   @retval 0  success
234 */
my_xml_attr_ensure_space(MY_XML_PARSER * st,size_t len)235 static int my_xml_attr_ensure_space(MY_XML_PARSER *st, size_t len)
236 {
237   size_t ofs= st->attr.end - st->attr.start;
238   len++; // Add terminating zero.
239   if (ofs + len > st->attr.buffer_size)
240   {
241     st->attr.buffer_size= (SIZE_T_MAX - len) / 2 > st->attr.buffer_size ?
242                             st->attr.buffer_size * 2 + len : SIZE_T_MAX;
243 
244     if (!st->attr.buffer)
245     {
246       st->attr.buffer= (char *) my_str_malloc(st->attr.buffer_size);
247       if (st->attr.buffer)
248         memcpy(st->attr.buffer, st->attr.static_buffer, ofs + 1 /*term. zero */);
249     }
250     else
251       st->attr.buffer= (char *) my_str_realloc(st->attr.buffer,
252                                                st->attr.buffer_size);
253     st->attr.start= st->attr.buffer;
254     st->attr.end= st->attr.start + ofs;
255 
256     return st->attr.buffer ? MY_XML_OK : MY_XML_ERROR;
257   }
258   return MY_XML_OK;
259 }
260 
261 
262 /** rewind the attr buffer to initial state */
my_xml_attr_rewind(MY_XML_PARSER * p)263 static void my_xml_attr_rewind(MY_XML_PARSER *p)
264 {
265   /* keep the buffer already allocated */
266   p->attr.end= p->attr.start;
267 }
268 
269 
/**
  Append a name to the current path and notify the enter handler.

  The path is kept in the attr buffer as names joined by '/' and is always
  zero terminated.

  @param st   parser state
  @param str  name of the node being entered
  @param len  length of the name in bytes
  @return MY_XML_ERROR if the buffer could not be grown, otherwise the
          handler's return code (MY_XML_OK when no handler is set)
*/
static int my_xml_enter(MY_XML_PARSER *st, const char *str, size_t len)
{
  const int grow_failed= my_xml_attr_ensure_space(st, len + 1 /* the separator char */);

  if (grow_failed)
    return MY_XML_ERROR;

  /* A separator is needed only when the path is not empty */
  if (st->attr.end > st->attr.start)
    *st->attr.end++= '/';

  memcpy(st->attr.end, str, len);
  st->attr.end+= len;
  *st->attr.end= '\0';

  if (!st->enter)
    return MY_XML_OK;

  /* Relative mode passes the bare name, otherwise the full path */
  if (st->flags & MY_XML_FLAG_RELATIVE_NAMES)
    return st->enter(st, str, len);

  return st->enter(st, st->attr.start, st->attr.end - st->attr.start);
}
289 
290 
/**
  Copy at most min(l1, l2) bytes of src into s and zero terminate s.

  @param s    destination; must have room for min(l1, l2) + 1 bytes
  @param src  source bytes (need not be zero terminated)
  @param l1   first length limit
  @param l2   second length limit
*/
static void mstr(char *s,const char *src,size_t l1, size_t l2)
{
  size_t n= l2;

  if (l1 < l2)
    n= l1;

  memcpy(s, src, n);
  s[n]= '\0';
}
297 
298 
/**
  Close the innermost open node: notify the leave handler and remove the
  last component from the current path.

  When str is given, its length is checked against the length of the last
  path component and a mismatch is reported as an error in p->errstr.
  NOTE(review): only the lengths are compared, not the names themselves,
  so "<abc></xyz>" is accepted. Confirm whether this is intended.

  @param p     parser state
  @param str   name found in the closing tag, or NULL to close
               unconditionally
  @param slen  length of str in bytes
  @return MY_XML_ERROR on a length mismatch, otherwise the handler's
          return code (MY_XML_OK when no handler is set)
*/
static int my_xml_leave(MY_XML_PARSER *p, const char *str, size_t slen)
{
  char *e;
  const char *tag;
  size_t glen;
  char s[32];
  char g[32];
  int  rc;

  /* Find previous '/' or beginning */
  for (e= p->attr.end; (e > p->attr.start) && (e[0] != '/') ; e--);

  /*
    The last component starts after the '/', or right at e when the
    path holds a single component and no separator was found.
  */
  tag= (e[0] == '/') ? e + 1 : e;
  glen= (size_t) (p->attr.end - tag);

  if (str && (slen != glen))
  {
    /* Names are truncated to fit the local buffers */
    mstr(s,str,sizeof(s)-1,slen);
    if (glen)
    {
      mstr(g,tag,sizeof(g)-1,glen);
      sprintf(p->errstr,"'</%s>' unexpected ('</%s>' wanted)",s,g);
    }
    else
      sprintf(p->errstr,"'</%s>' unexpected (END-OF-INPUT wanted)", s);
    return MY_XML_ERROR;
  }

  /* The handler sees the path before the last component is removed */
  if (p->flags & MY_XML_FLAG_RELATIVE_NAMES)
    rc= p->leave_xml ? p->leave_xml(p, str, slen) : MY_XML_OK;
  else
    rc= (p->leave_xml ?
         p->leave_xml(p, p->attr.start, p->attr.end - p->attr.start) :
         MY_XML_OK);

  *e='\0';
  p->attr.end= e;

  return rc;
}
336 
337 
/**
  Parse an XML document held in memory and invoke the installed handlers.

  The input is walked from str to str + len:
    - text between tags is passed to the value handler;
    - comments are skipped;
    - CDATA content is passed to the value handler without normalization;
    - every tag, "<?name ...?>" and "<!name ...>" construct is reported
      through my_xml_enter() and later my_xml_leave();
    - every attribute is reported as enter / value / leave.

  The parser state is expected to be initialised before the call
  (e.g. by my_xml_parser_create()), as the attr buffer is used right away.

  @param p    parser state
  @param str  start of the input
  @param len  length of the input in bytes
  @return MY_XML_OK on success; MY_XML_ERROR on a syntax error, an
          unclosed tag at end of input, or when my_xml_enter(),
          my_xml_leave() or an attribute value handler call fails.
          Syntax errors leave a message in p->errstr.
*/
int my_xml_parse(MY_XML_PARSER *p,const char *str, size_t len)
{

  my_xml_attr_rewind(p);

  p->beg=str;
  p->cur=str;
  p->end=str+len;

  while ( p->cur < p->end )
  {
    MY_XML_ATTR a;
    if (p->cur[0] == '<')
    {
      int lex;
      int question=0;  /* set when inside <? ... ?> */
      int exclam=0;    /* set when inside <! ... > */

      /* Either a whole comment / CDATA section, or just the '<' */
      lex=my_xml_scan(p,&a);

      if (MY_XML_COMMENT == lex)
        continue;

      if (lex == MY_XML_CDATA)
      {
        /*
          Strip the "<![CDATA[" and "]]>" delimiters.
          NOTE(review): this assumes the token spans at least those
          12 bytes, i.e. that the section was terminated.
          NOTE(review): the handler's return code is ignored here.
        */
        a.beg+= 9;
        a.end-= 3;
        my_xml_value(p, a.beg, (size_t) (a.end-a.beg));
        continue;
      }

      /* The token following '<' */
      lex=my_xml_scan(p,&a);

      if (MY_XML_SLASH == lex)
      {
        /* Closing tag: </ident> */
        if (MY_XML_IDENT != (lex=my_xml_scan(p,&a)))
        {
          sprintf(p->errstr,"%s unexpected (ident wanted)",lex2str(lex));
          return MY_XML_ERROR;
        }
        if (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg)))
          return MY_XML_ERROR;
        lex=my_xml_scan(p,&a);
        goto gt;
      }

      if (MY_XML_EXCLAM == lex)
      {
        lex=my_xml_scan(p,&a);
        exclam=1;
      }
      else if (MY_XML_QUESTION == lex)
      {
        lex=my_xml_scan(p,&a);
        question=1;
      }

      if (MY_XML_IDENT == lex)
      {
        p->current_node_type= MY_XML_NODE_TAG;
        if (MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg)))
          return MY_XML_ERROR;
      }
      else
      {
        sprintf(p->errstr,"%s unexpected (ident or '/' wanted)",
		lex2str(lex));
        return MY_XML_ERROR;
      }

      /*
        Attribute list. "a" holds the attribute name, "b" the token
        that follows it.
      */
      while ((MY_XML_IDENT == (lex=my_xml_scan(p,&a))) ||
             ((MY_XML_STRING == lex && exclam)))
      {
        MY_XML_ATTR b;
        if (MY_XML_EQ == (lex=my_xml_scan(p,&b)))
        {
          /* name = value */
          lex=my_xml_scan(p,&b);
          if ( (lex == MY_XML_IDENT) || (lex == MY_XML_STRING) )
          {
            p->current_node_type= MY_XML_NODE_ATTR;
            if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg)))  ||
                (MY_XML_OK != my_xml_value(p,b.beg,(size_t) (b.end-b.beg)))  ||
                (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))))
              return MY_XML_ERROR;
          }
          else
          {
            sprintf(p->errstr,"%s unexpected (ident or string wanted)",
		    lex2str(lex));
            return MY_XML_ERROR;
          }
        }
        else if (MY_XML_IDENT == lex)
        {
          /*
            Name followed by another identifier: "a" is reported as an
            attribute without a value. The identifier scanned into "b"
            is not reported by this branch.
          */
          p->current_node_type= MY_XML_NODE_ATTR;
          if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) ||
              (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))))
           return MY_XML_ERROR;
        }
        else if ((MY_XML_STRING == lex) && exclam)
        {
          /*
            We are in <!DOCTYPE>, e.g.
            <!DOCTYPE name SYSTEM "SystemLiteral">
            <!DOCTYPE name PUBLIC "PublicidLiteral" "SystemLiteral">
            Just skip "SystemLiteral" and "PublicidLiteral"
          */
        }
        else
          break;
      }

      if (lex == MY_XML_SLASH)
      {
        /* Self-closing tag: <ident ... /> */
        if (MY_XML_OK != my_xml_leave(p,NULL,0))
          return MY_XML_ERROR;
        lex=my_xml_scan(p,&a);
      }

gt:
      if (question)
      {
        /* <?ident ... ?> must end with '?' before '>' */
        if (lex != MY_XML_QUESTION)
        {
          sprintf(p->errstr,"%s unexpected ('?' wanted)",lex2str(lex));
          return MY_XML_ERROR;
        }
        if (MY_XML_OK != my_xml_leave(p,NULL,0))
          return MY_XML_ERROR;
        lex=my_xml_scan(p,&a);
      }

      if (exclam)
      {
        /* <!ident ... > has no closing tag: leave it right away */
        if (MY_XML_OK != my_xml_leave(p,NULL,0))
          return MY_XML_ERROR;
      }

      if (lex != MY_XML_GT)
      {
        sprintf(p->errstr,"%s unexpected ('>' wanted)",lex2str(lex));
        return MY_XML_ERROR;
      }
    }
    else
    {
      /* Text node: everything up to the next '<' or end of input */
      a.beg=p->cur;
      for ( ; (p->cur < p->end) && (p->cur[0] != '<')  ; p->cur++);
      a.end=p->cur;

      if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
        my_xml_norm_text(&a);
      if (a.beg != a.end)
      {
        /* NOTE(review): the handler's return code is ignored here */
        my_xml_value(p,a.beg,(size_t) (a.end-a.beg));
      }
    }
  }

  /* A non-empty path at this point means some tag was never closed */
  if (p->attr.start[0])
  {
    sprintf(p->errstr,"unexpected END-OF-INPUT");
    return MY_XML_ERROR;
  }
  return MY_XML_OK;
}
504 
505 
/**
  Initialise a parser instance.

  All fields are zeroed and the path buffer is pointed at the embedded
  static buffer, which is used for as long as it is large enough.

  @param p  parser instance to initialise
*/
void my_xml_parser_create(MY_XML_PARSER *p)
{
  memset(p, 0, sizeof(*p));

  p->attr.buffer_size= sizeof(p->attr.static_buffer);
  p->attr.start= p->attr.static_buffer;
  p->attr.end= p->attr.static_buffer;
}
515 
516 
/**
  Release the dynamically allocated path buffer, if there is one.

  @param p  parser instance
*/
void my_xml_parser_free(MY_XML_PARSER *p)
{
  /* Nothing to do while only the static buffer has been used */
  if (!p->attr.buffer)
    return;

  my_str_free(p->attr.buffer);
  p->attr.buffer= NULL;
}
525 
526 
/**
  Install the handler that receives values (text, CDATA content and
  attribute values).

  @param p       parser instance
  @param action  handler to call; it gets the parser, the start of the
                 value and its length in bytes
*/
void my_xml_set_value_handler(MY_XML_PARSER *p,
                              int (*action)(MY_XML_PARSER *p, const char *s,
                                            size_t l))
{
  p->value= action;
}
533 
/**
  Install the handler that is called when a tag or attribute is entered.

  @param p       parser instance
  @param action  handler to call; it gets the parser, a name or path and
                 its length in bytes
*/
void my_xml_set_enter_handler(MY_XML_PARSER *p,
                              int (*action)(MY_XML_PARSER *p, const char *s,
                                            size_t l))
{
  p->enter= action;
}
540 
541 
/**
  Install the handler that is called when a tag or attribute is left.

  @param p       parser instance
  @param action  handler to call; it gets the parser, a name or path and
                 its length in bytes
*/
void my_xml_set_leave_handler(MY_XML_PARSER *p,
                              int (*action)(MY_XML_PARSER *p, const char *s,
                                            size_t l))
{
  p->leave_xml= action;
}
548 
549 
/**
  Attach an opaque user pointer to the parser instance.

  @param p          parser instance
  @param user_data  pointer stored in p->user_data
*/
void my_xml_set_user_data(MY_XML_PARSER *p, void *user_data)
{
  p->user_data= user_data;
}
554 
555 
/**
  Get the error message buffer of the parser.

  @param p  parser instance
  @return p->errstr
*/
const char *my_xml_error_string(MY_XML_PARSER *p)
{
  const char *message= p->errstr;
  return message;
}
560 
561 
/**
  Get the position of the parser cursor within its line.

  The input is searched backwards from the cursor for the nearest '\n'.
  The loop inspects s[-1] so that no pointer before the start of the input
  is ever formed.

  @param p  parser instance
  @return distance in bytes from the nearest preceding '\n' to p->cur,
          or from the start of the input when there is no preceding '\n'
*/
size_t my_xml_error_pos(MY_XML_PARSER *p)
{
  const char *beg=p->beg;
  const char *s;
  for (s= p->cur; s > p->beg; s--)
  {
    if (s[-1] == '\n')
    {
      beg= s - 1;
      break;
    }
  }
  return (size_t) (p->cur-beg);
}
576 
my_xml_error_lineno(MY_XML_PARSER * p)577 uint my_xml_error_lineno(MY_XML_PARSER *p)
578 {
579   uint res=0;
580   const char *s;
581   for (s=p->beg ; s<p->cur; s++)
582   {
583     if (s[0] == '\n')
584       res++;
585   }
586   return res;
587 }
588