1 /* Copyright (c) 2003, 2011, Oracle and/or its affiliates.
2    Copyright (c) 2011 Monty Program Ab
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1335  USA */
16 
17 #include "strings_def.h"
18 #include "m_string.h"
19 #include "my_xml.h"
20 #include "my_sys.h"
21 
22 
/*
  Lexical token codes produced by my_xml_scan() and named by lex2str().
  Punctuation tokens ('=', '<', '>', '/', '?', '!') use the character itself
  as their code, because the scanner returns the first byte of the token
  for those; the remaining codes are arbitrary distinct letters.
*/
#define MY_XML_UNKNOWN  'U'
#define MY_XML_EOF	'E'
#define MY_XML_STRING	'S'
#define MY_XML_IDENT	'I'
#define MY_XML_EQ	'='
#define MY_XML_LT	'<'
#define MY_XML_GT	'>'
#define MY_XML_SLASH	'/'
#define MY_XML_COMMENT	'C'
#define MY_XML_TEXT	'T'
#define MY_XML_QUESTION	'?'
#define MY_XML_EXCLAM   '!'
#define MY_XML_CDATA    'D'
36 
/*
  A slice of the input buffer describing one token or text fragment.
  The slice is the half-open range [beg, end): its length is end - beg.
  It points into the caller's buffer; nothing is copied.
*/
typedef struct xml_attr_st
{
  const char *beg;  /* First byte of the fragment */
  const char *end;  /* One past the last byte of the fragment */
} MY_XML_ATTR;
42 
43 
/*
  XML ctype: bit flags stored in my_xml_ctype[] for every byte value.
  A table entry of 3 means "may start and continue an identifier",
  2 means "may only continue an identifier", 8 means "white space".
*/
#define	MY_XML_ID0  0x01 /* Identifier initial character */
#define	MY_XML_ID1  0x02 /* Identifier medial  character */
#define	MY_XML_SPC  0x08 /* Spacing character */
50 
51 
52 /*
53  http://www.w3.org/TR/REC-xml/
54  [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
55                   CombiningChar | Extender
56  [5] Name ::= (Letter | '_' | ':') (NameChar)*
57 */
58 
/*
  Per-byte classification table, indexed by unsigned byte value.
  Each entry is a combination of MY_XML_ID0 (0x01), MY_XML_ID1 (0x02)
  and MY_XML_SPC (0x08). All bytes >= 0x80 are accepted as identifier
  characters. The table is read-only, hence const.
*/
static const char my_xml_ctype[256]=
{
/*00*/  0,0,0,0,0,0,0,0,0,8,8,0,0,8,0,0,
/*10*/  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
/*20*/  8,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,  /*  !"#$%&'()*+,-./ */
/*30*/  2,2,2,2,2,2,2,2,2,2,3,0,0,0,0,0,  /* 0123456789:;<=>? */
/*40*/  0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,  /* @ABCDEFGHIJKLMNO */
/*50*/  3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,3,  /* PQRSTUVWXYZ[\]^_ */
/*60*/  0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,  /* `abcdefghijklmno */
/*70*/  3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,  /* pqrstuvwxyz{|}~  */
/*80*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*90*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*A0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*B0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*C0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*D0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*E0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/*F0*/  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
};
78 
/*
  Character class tests backed by my_xml_ctype[]. Each yields a non-zero
  value when the byte has the corresponding flag. The argument is cast to
  uchar before indexing (presumably so that bytes >= 0x80 do not produce a
  negative index where plain char is signed).
*/
#define my_xml_is_space(c)  (my_xml_ctype[(uchar) (c)] & MY_XML_SPC)
#define my_xml_is_id0(c)    (my_xml_ctype[(uchar) (c)] & MY_XML_ID0)
#define my_xml_is_id1(c)    (my_xml_ctype[(uchar) (c)] & MY_XML_ID1)
82 
83 
lex2str(int lex)84 static const char *lex2str(int lex)
85 {
86   switch(lex)
87   {
88     case MY_XML_EOF:      return "END-OF-INPUT";
89     case MY_XML_STRING:   return "STRING";
90     case MY_XML_IDENT:    return "IDENT";
91     case MY_XML_CDATA:    return "CDATA";
92     case MY_XML_EQ:       return "'='";
93     case MY_XML_LT:       return "'<'";
94     case MY_XML_GT:       return "'>'";
95     case MY_XML_SLASH:    return "'/'";
96     case MY_XML_COMMENT:  return "COMMENT";
97     case MY_XML_TEXT:     return "TEXT";
98     case MY_XML_QUESTION: return "'?'";
99     case MY_XML_EXCLAM:   return "'!'";
100   }
101   return "unknown token";
102 }
103 
/**
  Trim leading and trailing white space from a fragment, in place.

  Only the beg/end pointers are moved; the underlying text is untouched.
  A fragment made entirely of white space collapses to an empty one.

  @param a  fragment to trim
*/
static void my_xml_norm_text(MY_XML_ATTR *a)
{
  /* Strip from the front first ... */
  while (a->beg < a->end && my_xml_is_space(*a->beg))
    a->beg++;

  /* ... then from the back, never crossing the new beginning. */
  while (a->beg < a->end && my_xml_is_space(*(a->end - 1)))
    a->end--;
}
109 
110 
111 static inline my_bool
my_xml_parser_prefix_cmp(MY_XML_PARSER * p,const char * s,size_t slen)112 my_xml_parser_prefix_cmp(MY_XML_PARSER *p, const char *s, size_t slen)
113 {
114   return (p->cur + slen > p->end) || memcmp(p->cur, s, slen);
115 }
116 
117 
/**
  Read the next lexical token from the parser input.

  Leading white space is skipped. On return a->beg / a->end delimit the
  token text and p->cur points just past the consumed input.

  Token specifics:
  - COMMENT: the whole "<!-- ... -->" construct; an unterminated comment
    extends to the end of input.
  - CDATA: the whole "<![CDATA[ ... ]]>" construct including delimiters.
    An unterminated section consumes the rest of the input and is reported
    as MY_XML_EOF, so the caller never sees a CDATA token that is shorter
    than its own delimiters.
  - STRING: the text between matching quotes (quotes excluded); trimmed
    unless MY_XML_FLAG_SKIP_TEXT_NORMALIZATION is set. A missing closing
    quote makes the string run to the end of input.
  - Punctuation ("?=/<>!"): the token code is the character itself.
  - Anything else, including a NUL byte: MY_XML_UNKNOWN, nothing consumed.

  @param p  the parser instance
  @param a  receives the token boundaries
  @return   the token code (MY_XML_* token constant)
*/
static int my_xml_scan(MY_XML_PARSER *p,MY_XML_ATTR *a)
{
  int lex;

  for (; ( p->cur < p->end) && my_xml_is_space(p->cur[0]) ;  p->cur++);

  if (p->cur >= p->end)
  {
    a->beg=p->end;
    a->end=p->end;
    return MY_XML_EOF;
  }

  a->beg=p->cur;
  a->end=p->cur;

  if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<!--")))
  {
    /*
      Step over the opener before searching for the terminator, so that
      the dashes of "<!--" cannot be taken for the start of "-->"
      (otherwise "<!-->" would count as a complete comment).
    */
    p->cur+= 4;
    for (; p->cur < p->end; p->cur++)
    {
      if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("-->")))
      {
        p->cur+= 3;
        break;
      }
    }
    a->end=p->cur;
    lex=MY_XML_COMMENT;
  }
  else if (!my_xml_parser_prefix_cmp(p, C_STRING_WITH_LEN("<![CDATA[")))
  {
    int terminated= 0;

    p->cur+= 9;
    for (; p->cur < p->end - 2 ; p->cur++)
    {
      if (p->cur[0] == ']' && p->cur[1] == ']' && p->cur[2] == '>')
      {
        p->cur+= 3;
        a->end= p->cur;
        terminated= 1;
        break;
      }
    }
    if (terminated)
      lex= MY_XML_CDATA;
    else
    {
      /* No "]]>" found: the input ended inside the CDATA section. */
      p->cur= p->end;
      a->beg= p->end;
      a->end= p->end;
      lex= MY_XML_EOF;
    }
  }
  else if (p->cur[0] != '\0' && strchr("?=/<>!",p->cur[0]))
  {
    /*
      The explicit NUL test is needed because strchr() also matches the
      terminating zero of the literal.
    */
    p->cur++;
    a->end=p->cur;
    lex=a->beg[0];
  }
  else if ( (p->cur[0] == '"') || (p->cur[0] == '\'') )
  {
    /*
      "string" or 'string' found.
      Scan until the closing quote/doublequote, or until the END-OF-INPUT.
    */
    p->cur++;
    for (; ( p->cur < p->end ) && (p->cur[0] != a->beg[0]); p->cur++)
    {}
    a->end=p->cur;
    if (p->cur < p->end) /* Closing quote or doublequote has been found */
      p->cur++;
    a->beg++;
    if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
      my_xml_norm_text(a);
    lex=MY_XML_STRING;
  }
  else if (my_xml_is_id0(p->cur[0]))
  {
    p->cur++;
    while (p->cur < p->end && my_xml_is_id1(p->cur[0]))
      p->cur++;
    a->end=p->cur;
    my_xml_norm_text(a);
    lex=MY_XML_IDENT;
  }
  else
    lex= MY_XML_UNKNOWN;

  return lex;
}
204 
205 
/**
  Deliver a value (text, CDATA or attribute value) to the value handler.

  @param st   the parser instance
  @param str  start of the value
  @param len  length of the value in bytes
  @return     the handler's result, or MY_XML_OK when no handler is set
*/
static int my_xml_value(MY_XML_PARSER *st, const char *str, size_t len)
{
  if (!st->value)
    return MY_XML_OK;
  return st->value(st, str, len);
}
210 
211 
212 /**
213   Ensure the attr buffer is wide enough to hold the new value
214 
215   Expand and/or allocate dynamic buffer as needed to hold the concatenated
216   path and the terminating zero.
217 
218   @attr st   the parser instance
219   @attr len  the length of the attribute to be added
220   @return state
221   @retval 1  failed
222   @retval 0  success
223 */
my_xml_attr_ensure_space(MY_XML_PARSER * st,size_t len)224 static int my_xml_attr_ensure_space(MY_XML_PARSER *st, size_t len)
225 {
226   size_t ofs= st->attr.end - st->attr.start;
227   len++; // Add terminating zero.
228   if (ofs + len > st->attr.buffer_size)
229   {
230     st->attr.buffer_size= (SIZE_T_MAX - len) / 2 > st->attr.buffer_size ?
231                             st->attr.buffer_size * 2 + len : SIZE_T_MAX;
232 
233     if (!st->attr.buffer)
234     {
235       st->attr.buffer= (char *) my_malloc(st->attr.buffer_size, MYF(0));
236       if (st->attr.buffer)
237         memcpy(st->attr.buffer, st->attr.static_buffer, ofs + 1 /*term. zero */);
238     }
239     else
240       st->attr.buffer= (char *) my_realloc(st->attr.buffer,
241                                            st->attr.buffer_size, MYF(0));
242     st->attr.start= st->attr.buffer;
243     st->attr.end= st->attr.start + ofs;
244 
245     return st->attr.buffer ? MY_XML_OK : MY_XML_ERROR;
246   }
247   return MY_XML_OK;
248 }
249 
250 
251 /** rewind the attr buffer to initial state */
my_xml_attr_rewind(MY_XML_PARSER * p)252 static void my_xml_attr_rewind(MY_XML_PARSER *p)
253 {
254   /* keep the buffer already allocated */
255   p->attr.end= p->attr.start;
256 }
257 
258 
/**
  Append a name to the current path and notify the enter handler.

  Path components are separated by '/'; no separator is written before
  the first component. The path is kept zero-terminated.

  @param st   the parser instance
  @param str  name of the tag or attribute being entered
  @param len  length of the name in bytes
  @return     MY_XML_ERROR if the path buffer cannot be grown, otherwise
              the handler's result (MY_XML_OK when no handler is set).
              With MY_XML_FLAG_RELATIVE_NAMES the handler receives the
              name only, otherwise the full path.
*/
static int my_xml_enter(MY_XML_PARSER *st, const char *str, size_t len)
{
  char *dst;

  if (my_xml_attr_ensure_space(st, len + 1 /* the separator char */))
    return MY_XML_ERROR;

  dst= st->attr.end;
  if (dst > st->attr.start)
    *dst++= '/';
  memcpy(dst, str, len);
  dst+= len;
  *dst= '\0';
  st->attr.end= dst;

  if (!st->enter)
    return MY_XML_OK;

  if (st->flags & MY_XML_FLAG_RELATIVE_NAMES)
    return st->enter(st, str, len);

  return st->enter(st, st->attr.start, st->attr.end - st->attr.start);
}
278 
279 
/**
  Copy at most l1 bytes of a length-delimited string and zero-terminate.

  @param s    destination; must have room for min(l1, l2) + 1 bytes
  @param src  source bytes (need not be zero-terminated)
  @param l1   maximum number of bytes to copy
  @param l2   number of bytes available in src
*/
static void mstr(char *s,const char *src,size_t l1, size_t l2)
{
  size_t n= l2;

  if (l1 < l2)
    n= l1;
  memcpy(s, src, n);
  s[n]= '\0';
}
286 
287 
/**
  Remove the last component from the current path and notify the leave
  handler.

  When a closing name is supplied it must equal the last path component;
  otherwise an error message is written to p->errstr and nothing is
  removed. The handler is invoked before the path is shortened, so in
  absolute mode it still sees the full path of the element being left.

  @param p     the parser instance
  @param str   name found in the closing tag, or NULL when the element is
               closed implicitly (no name to verify)
  @param slen  length of str in bytes
  @return      MY_XML_ERROR on a name mismatch, otherwise the handler's
               result (MY_XML_OK when no handler is set). With
               MY_XML_FLAG_RELATIVE_NAMES the handler receives str/slen
               as given, otherwise the full path.
*/
static int my_xml_leave(MY_XML_PARSER *p, const char *str, size_t slen)
{
  char   found[32];
  char   wanted[32];
  char  *cut= p->attr.end;
  char  *last;
  size_t last_len;
  int    res;

  /* Walk back to the separator of the last component, or to the start. */
  while (cut > p->attr.start && cut[0] != '/')
    cut--;

  last= cut;
  if (cut[0] == '/')
    last= cut + 1;
  last_len= (size_t) (p->attr.end - last);

  if (str && (slen != last_len || memcmp(str, last, slen)))
  {
    /* Names are truncated to fit the fixed-size message buffers. */
    mstr(found, str, sizeof(found)-1, slen);
    if (last_len)
    {
      mstr(wanted, last, sizeof(wanted)-1, last_len);
      sprintf(p->errstr,"'</%s>' unexpected ('</%s>' wanted)",found,wanted);
    }
    else
      sprintf(p->errstr,"'</%s>' unexpected (END-OF-INPUT wanted)", found);
    return MY_XML_ERROR;
  }

  if (!p->leave_xml)
    res= MY_XML_OK;
  else if (p->flags & MY_XML_FLAG_RELATIVE_NAMES)
    res= p->leave_xml(p, str, slen);
  else
    res= p->leave_xml(p, p->attr.start, p->attr.end - p->attr.start);

  /* Drop the last component together with its separator. */
  *cut= '\0';
  p->attr.end= cut;

  return res;
}
326 
327 
/**
  Parse an XML document held in memory, invoking the registered handlers.

  Tags and attributes are reported through the enter/leave handlers, and
  text, CDATA content and attribute values through the value handler.
  Comments are skipped. The result of the value handler is checked for
  attribute values only; for text and CDATA it is ignored.

  @param p    the parser instance
  @param str  start of the document
  @param len  length of the document in bytes
  @retval MY_XML_OK     the whole input was parsed and all tags were closed
  @retval MY_XML_ERROR  syntax error, handler failure or out of memory;
                        for syntax errors a message is left in p->errstr
*/
int my_xml_parse(MY_XML_PARSER *p,const char *str, size_t len)
{

  my_xml_attr_rewind(p);

  p->beg=str;
  p->cur=str;
  p->end=str+len;

  while ( p->cur < p->end )
  {
    MY_XML_ATTR a;
    if (p->cur[0] == '<')
    {
      int lex;
      int question=0;
      int exclam=0;

      lex=my_xml_scan(p,&a);

      if (MY_XML_COMMENT == lex)
        continue;

      if (lex == MY_XML_CDATA)
      {
        const size_t open_len=  sizeof("<![CDATA[") - 1;
        const size_t close_len= sizeof("]]>") - 1;

        /*
          The token must hold both delimiters before they are stripped.
          A shorter token (e.g. an unterminated section) would otherwise
          give a negative, i.e. huge unsigned, length.
        */
        if ((size_t) (a.end - a.beg) < open_len + close_len)
        {
          sprintf(p->errstr,"unexpected END-OF-INPUT");
          return MY_XML_ERROR;
        }
        a.beg+= open_len;
        a.end-= close_len;
        my_xml_value(p, a.beg, (size_t) (a.end-a.beg));
        continue;
      }

      lex=my_xml_scan(p,&a);

      if (MY_XML_SLASH == lex)
      {
        /* Closing tag: </name> */
        if (MY_XML_IDENT != (lex=my_xml_scan(p,&a)))
        {
          sprintf(p->errstr,"%s unexpected (ident wanted)",lex2str(lex));
          return MY_XML_ERROR;
        }
        if (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg)))
          return MY_XML_ERROR;
        lex=my_xml_scan(p,&a);
        goto gt;
      }

      if (MY_XML_EXCLAM == lex)
      {
        lex=my_xml_scan(p,&a);
        exclam=1;
      }
      else if (MY_XML_QUESTION == lex)
      {
        lex=my_xml_scan(p,&a);
        question=1;
      }

      if (MY_XML_IDENT == lex)
      {
        p->current_node_type= MY_XML_NODE_TAG;
        if (MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg)))
          return MY_XML_ERROR;
      }
      else
      {
        sprintf(p->errstr,"%s unexpected (ident or '/' wanted)",
                lex2str(lex));
        return MY_XML_ERROR;
      }

      /* Attribute list: name=value, name="value" or a bare name. */
      while ((MY_XML_IDENT == (lex=my_xml_scan(p,&a))) ||
             ((MY_XML_STRING == lex && exclam)))
      {
        MY_XML_ATTR b;
        if (MY_XML_EQ == (lex=my_xml_scan(p,&b)))
        {
          lex=my_xml_scan(p,&b);
          if ( (lex == MY_XML_IDENT) || (lex == MY_XML_STRING) )
          {
            p->current_node_type= MY_XML_NODE_ATTR;
            if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg)))  ||
                (MY_XML_OK != my_xml_value(p,b.beg,(size_t) (b.end-b.beg)))  ||
                (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))))
              return MY_XML_ERROR;
          }
          else
          {
            sprintf(p->errstr,"%s unexpected (ident or string wanted)",
                    lex2str(lex));
            return MY_XML_ERROR;
          }
        }
        else if (MY_XML_IDENT == lex)
        {
          p->current_node_type= MY_XML_NODE_ATTR;
          if ((MY_XML_OK != my_xml_enter(p,a.beg,(size_t) (a.end-a.beg))) ||
              (MY_XML_OK != my_xml_leave(p,a.beg,(size_t) (a.end-a.beg))))
           return MY_XML_ERROR;
        }
        else if ((MY_XML_STRING == lex) && exclam)
        {
          /*
            We are in <!DOCTYPE>, e.g.
            <!DOCTYPE name SYSTEM "SystemLiteral">
            <!DOCTYPE name PUBLIC "PubidLiteral" "SystemLiteral">
            Just skip "SystemLiteral" and "PubidLiteral"
          */
        }
        else
          break;
      }

      if (lex == MY_XML_SLASH)
      {
        /* Self-closing tag: <name ... /> */
        if (MY_XML_OK != my_xml_leave(p,NULL,0))
          return MY_XML_ERROR;
        lex=my_xml_scan(p,&a);
      }

gt:
      if (question)
      {
        if (lex != MY_XML_QUESTION)
        {
          sprintf(p->errstr,"%s unexpected ('?' wanted)",lex2str(lex));
          return MY_XML_ERROR;
        }
        if (MY_XML_OK != my_xml_leave(p,NULL,0))
          return MY_XML_ERROR;
        lex=my_xml_scan(p,&a);
      }

      if (exclam)
      {
        if (MY_XML_OK != my_xml_leave(p,NULL,0))
          return MY_XML_ERROR;
      }

      if (lex != MY_XML_GT)
      {
        sprintf(p->errstr,"%s unexpected ('>' wanted)",lex2str(lex));
        return MY_XML_ERROR;
      }
    }
    else
    {
      /* Character data up to the next '<' or the end of input. */
      a.beg=p->cur;
      for ( ; (p->cur < p->end) && (p->cur[0] != '<')  ; p->cur++);
      a.end=p->cur;

      if (!(p->flags & MY_XML_FLAG_SKIP_TEXT_NORMALIZATION))
        my_xml_norm_text(&a);
      if (a.beg != a.end)
      {
        my_xml_value(p,a.beg,(size_t) (a.end-a.beg));
      }
    }
  }

  /* A non-empty path here means some element was never closed. */
  if (p->attr.start[0])
  {
    sprintf(p->errstr,"unexpected END-OF-INPUT");
    return MY_XML_ERROR;
  }
  return MY_XML_OK;
}
494 
495 
/**
  Initialise a parser instance.

  Every field is zeroed and the path buffer is pointed at the embedded
  static buffer, so no memory is allocated here.

  @param p  the parser instance to initialise
*/
void my_xml_parser_create(MY_XML_PARSER *p)
{
  memset(p, 0, sizeof(*p));

  /* Start with the embedded buffer; the heap is used only on overflow. */
  p->attr.buffer_size= sizeof(p->attr.static_buffer);
  p->attr.start= p->attr.static_buffer;
  p->attr.end= p->attr.static_buffer;
}
505 
506 
/**
  Release the memory owned by a parser instance.

  If a dynamic path buffer was allocated it is freed, and the path
  pointers and size are reset to the embedded static buffer so that no
  field keeps referring to the freed block. Calling this function more
  than once is harmless.

  @param p  the parser instance
*/
void my_xml_parser_free(MY_XML_PARSER *p)
{
  if (p->attr.buffer)
  {
    my_free(p->attr.buffer);
    p->attr.buffer= NULL;

    /* start/end/buffer_size described the freed block; reset them. */
    p->attr.static_buffer[0]= '\0';
    p->attr.start= p->attr.end= p->attr.static_buffer;
    p->attr.buffer_size= sizeof(p->attr.static_buffer);
  }
}
515 
516 
/**
  Register the callback that receives text, CDATA and attribute values.

  @param p       the parser instance
  @param action  the callback, or NULL to have values silently accepted
*/
void my_xml_set_value_handler(MY_XML_PARSER *p,
                              int (*action)(MY_XML_PARSER *p,
                                            const char *s,
                                            size_t l))
{
  p->value= action;
}
523 
/**
  Register the callback invoked when a tag or attribute is entered.

  @param p       the parser instance
  @param action  the callback, or NULL to have entries silently accepted
*/
void my_xml_set_enter_handler(MY_XML_PARSER *p,
                              int (*action)(MY_XML_PARSER *p,
                                            const char *s,
                                            size_t l))
{
  p->enter= action;
}
530 
531 
/**
  Register the callback invoked when a tag or attribute is left.

  @param p       the parser instance
  @param action  the callback, or NULL to have exits silently accepted
*/
void my_xml_set_leave_handler(MY_XML_PARSER *p,
                              int (*action)(MY_XML_PARSER *p,
                                            const char *s,
                                            size_t l))
{
  p->leave_xml= action;
}
538 
539 
/**
  Attach an opaque pointer to the parser.

  The pointer is only stored in p->user_data; nothing in this function
  reads or frees it.

  @param p          the parser instance
  @param user_data  the pointer to store
*/
void my_xml_set_user_data(MY_XML_PARSER *p, void *user_data)
{
  p->user_data= user_data;
}
544 
545 
/**
  Get the parser's error message buffer.

  @param p  the parser instance
  @return   pointer to p->errstr, which the parser fills in when it
            reports a syntax error; the storage belongs to the parser
*/
const char *my_xml_error_string(MY_XML_PARSER *p)
{
  return p->errstr;
}
550 
551 
/**
  Get the position of the parser within the current line.

  @param p  the parser instance
  @return   distance in bytes from the last '\n' before p->cur (counted
            from the newline character itself) to p->cur, or from the
            start of the input when no newline precedes p->cur
*/
size_t my_xml_error_pos(MY_XML_PARSER *p)
{
  const char *line_start= p->beg;
  const char *pos= p->beg;

  while (pos < p->cur)
  {
    if (*pos == '\n')
      line_start= pos;
    pos++;
  }
  return (size_t) (p->cur - line_start);
}
563 
my_xml_error_lineno(MY_XML_PARSER * p)564 uint my_xml_error_lineno(MY_XML_PARSER *p)
565 {
566   uint res=0;
567   const char *s;
568   for (s=p->beg ; s<p->cur; s++)
569   {
570     if (s[0] == '\n')
571       res++;
572   }
573   return res;
574 }
575