1 /*  Part of SWI-Prolog
2 
3     Author:        Jan Wielemaker
4     E-mail:        J.Wielemaker@vu.nl
5     WWW:           http://www.swi-prolog.org
6     Copyright (c)  2002-2015, University of Amsterdam
7                               VU University Amsterdam
8     All rights reserved.
9 
10     Redistribution and use in source and binary forms, with or without
11     modification, are permitted provided that the following conditions
12     are met:
13 
14     1. Redistributions of source code must retain the above copyright
15        notice, this list of conditions and the following disclaimer.
16 
17     2. Redistributions in binary form must reproduce the above copyright
18        notice, this list of conditions and the following disclaimer in
19        the documentation and/or other materials provided with the
20        distribution.
21 
22     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26     COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
32     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33     POSSIBILITY OF SUCH DAMAGE.
34 */
35 
36 #include <config.h>
37 #include <SWI-Stream.h>			/* encoding */
38 #include <SWI-Prolog.h>
39 #include <stdlib.h>
40 #ifdef HAVE_MALLOC_H
41 #include <malloc.h>
42 #endif
43 #include "error.h"
44 #include <errno.h>
45 #include <string.h>
46 #include <stdio.h>
47 #include <wctype.h>
48 #include "xml_unicode.h"
49 #include "dtd.h"
50 #ifdef __WINDOWS__
51 #define inline __inline
52 #endif
53 
54 static atom_t ATOM_iso_latin_1;
55 static atom_t ATOM_utf8;
56 static atom_t ATOM_unicode;
57 static atom_t ATOM_ascii;
58 
59 #define CHARSET 256
60 
61 typedef struct charbuf
62 { char     buffer[1024];
63   char    *bufp;
64   char    *end;
65   size_t   size;
66 } charbuf;
67 
68 
69 static void
init_buf(charbuf * b)70 init_buf(charbuf *b)
71 { b->bufp = b->end = b->buffer;
72   b->size = sizeof(b->buffer);
73 }
74 
75 
76 static void
free_buf(charbuf * b)77 free_buf(charbuf *b)
78 { if ( b->bufp != b->buffer )
79     free(b->bufp);
80 }
81 
82 
83 static int
room_buf(charbuf * b,size_t room)84 room_buf(charbuf *b, size_t room)
85 { size_t used = b->end - b->bufp;
86 
87   if ( room + used > b->size )
88   { if ( b->bufp == b->buffer )
89     { b->size = sizeof(b->buffer)*2;
90       if ( !(b->bufp = malloc(b->size)) )
91 	return sgml2pl_error(ERR_ERRNO);
92 
93       memcpy(b->bufp, b->buffer, used);
94     } else
95     { char *ptr;
96 
97       b->size *= 2;
98       if ( !(ptr = realloc(b->bufp, b->size)) )
99 	return sgml2pl_error(ERR_ERRNO);
100       b->bufp = ptr;
101     }
102     b->end = b->bufp + used;
103   }
104 
105   return TRUE;
106 }
107 
108 
109 static size_t
used_buf(const charbuf * b)110 used_buf(const charbuf *b)
111 { return b->end - b->bufp;
112 }
113 
114 
115 static int
add_char_buf(charbuf * b,int chr)116 add_char_buf(charbuf *b, int chr)
117 { if ( room_buf(b, 1) )
118   { *b->end++ = chr;
119 
120     return TRUE;
121   }
122 
123   return FALSE;
124 }
125 
126 
127 static int
add_char_bufW(charbuf * b,int chr)128 add_char_bufW(charbuf *b, int chr)
129 { if ( room_buf(b, sizeof(wchar_t)) )
130   { wchar_t *p = (wchar_t*)b->end;
131 
132     *p++ = chr;
133     b->end = (char *)p;
134 
135     return TRUE;
136   }
137 
138   return FALSE;
139 }
140 
141 
142 static int
add_str_buf(charbuf * b,const char * s)143 add_str_buf(charbuf *b, const char *s)
144 { size_t len = strlen(s);
145 
146   if ( room_buf(b, len+1) )
147   { memcpy(b->end, s, len+1);
148     b->end += len;
149 
150     return TRUE;
151   }
152 
153   return FALSE;
154 }
155 
156 
157 static int
add_str_bufW(charbuf * b,const char * s)158 add_str_bufW(charbuf *b, const char *s)
159 { size_t len = strlen(s);
160 
161   if ( room_buf(b, len*sizeof(wchar_t)) )
162   { wchar_t *p = (wchar_t*)b->end;
163 
164     while(*s)
165       *p++ = *s++;
166     b->end = (char *)p;
167 
168     return TRUE;
169   }
170 
171   return FALSE;
172 }
173 
174 
175 
176 static foreign_t
do_quote(term_t in,term_t quoted,char ** map,int maxchr)177 do_quote(term_t in, term_t quoted, char **map, int maxchr)
178 { char *inA = NULL;
179   wchar_t *inW = NULL;
180   size_t len;
181   const unsigned  char *s;
182   charbuf buffer;
183   int changes = 0;
184   int rc;
185 
186   if ( !PL_get_nchars(in, &len, &inA, CVT_ATOMIC) &&
187        !PL_get_wchars(in, &len, &inW, CVT_ATOMIC) )
188     return sgml2pl_error(ERR_TYPE, "atom", in);
189   if ( len == 0 )
190     return PL_unify(in, quoted);
191 
192   init_buf(&buffer);
193 
194   if ( inA )
195   { for(s = (unsigned char*)inA ; len-- > 0; s++ )
196     { int c = *s;
197 
198       if ( map[c] )
199       { if ( !add_str_buf(&buffer, map[c]) )
200 	  return FALSE;
201 
202 	changes++;
203       } else if ( c > maxchr )
204       { char buf[20];
205 
206 	sprintf(buf, "&#%d;", c);
207 	if ( !add_str_buf(&buffer, buf) )
208 	  return FALSE;
209 
210 	changes++;
211       } else
212       { add_char_buf(&buffer, c);
213       }
214     }
215 
216     if ( changes > 0 )
217       rc = PL_unify_atom_nchars(quoted, used_buf(&buffer), buffer.bufp);
218     else
219       rc = PL_unify(in, quoted);
220   } else
221   { for( ; len-- > 0; inW++ )
222     { int c = *inW;
223 
224       if ( c <= 0xff && map[c] )
225       { if ( !add_str_bufW(&buffer, map[c]) )
226 	  return FALSE;
227 
228 	changes++;
229       } else if ( c > maxchr )
230       { char buf[20];
231 
232 	sprintf(buf, "&#%d;", c);
233 	if ( !add_str_bufW(&buffer, buf) )
234 	  return FALSE;
235 
236 	changes++;
237       }else
238       { add_char_bufW(&buffer, c);
239       }
240     }
241 
242     if ( changes > 0 )
243       rc = PL_unify_wchars(quoted, PL_ATOM,
244 			   used_buf(&buffer)/sizeof(wchar_t),
245 			   (wchar_t*)buffer.bufp);
246     else
247       rc = PL_unify(in, quoted);
248   }
249 
250   free_buf(&buffer);
251 
252   return rc;
253 }
254 
255 
256 static int
get_max_chr(term_t t,int * maxchr)257 get_max_chr(term_t t, int *maxchr)
258 { atom_t a;
259 
260   if ( PL_get_atom(t, &a) )
261   { if ( a == ATOM_iso_latin_1 )
262       *maxchr = 0xff;
263     else if ( a == ATOM_utf8 )
264       *maxchr = 0x7ffffff;
265     else if ( a == ATOM_unicode )
266       *maxchr = 0xffff;
267     else if ( a == ATOM_ascii )
268       *maxchr = 0x7f;
269     else
270       return sgml2pl_error(ERR_DOMAIN, "encoding", t);
271 
272     return TRUE;
273   }
274 
275   return sgml2pl_error(ERR_TYPE, "atom", t);
276 }
277 
278 
279 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
280 (*) xml_quote_attribute/3 assumes the attribute is   quoted using "" and
281 does *not* escape '. Although escaping ' with &apos; is valid XML, it is
282 *not* valid html, and this  routine  is   also  used  by  the html_write
283 library.
284 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
285 
286 static foreign_t
xml_quote_attribute(term_t in,term_t out,term_t encoding)287 xml_quote_attribute(term_t in, term_t out, term_t encoding)
288 { static char **map;
289   int maxchr;
290 
291   if ( !map )
292   { int i;
293 
294     if ( !(map = malloc(CHARSET*sizeof(char*))) )
295       return sgml2pl_error(ERR_ERRNO, errno);
296 
297     for(i=0; i<CHARSET; i++)
298       map[i] = NULL;
299 
300     map['<']  = "&lt;";
301     map['>']  = "&gt;";
302     map['&']  = "&amp;";
303 /*  map['\''] = "&apos;"; See (*) */
304     map['"']  = "&quot;";
305   }
306 
307   if ( !get_max_chr(encoding, &maxchr) )
308     return FALSE;
309 
310   return do_quote(in, out, map, maxchr);
311 }
312 
313 
314 static foreign_t
xml_quote_cdata(term_t in,term_t out,term_t encoding)315 xml_quote_cdata(term_t in, term_t out, term_t encoding)
316 { static char **map;
317   int maxchr;
318 
319   if ( !map )
320   { int i;
321 
322     if ( !(map = malloc(CHARSET*sizeof(char*))) )
323       return sgml2pl_error(ERR_ERRNO, errno);
324 
325     for(i=0; i<CHARSET; i++)
326       map[i] = NULL;
327 
328     map['<']  = "&lt;";
329     map['>']  = "&gt;";
330     map['&']  = "&amp;";
331   }
332 
333   if ( !get_max_chr(encoding, &maxchr) )
334     return FALSE;
335 
336   return do_quote(in, out, map, maxchr);
337 }
338 
339 
340 static inline int
is_xml_nmstart(dtd_charclass * map,int c)341 is_xml_nmstart(dtd_charclass *map, int c)
342 { if ( c <= 0xff )
343   { return (map->class[c] & CH_NMSTART);
344   } else
345   { return ( xml_basechar(c) ||
346 	     xml_ideographic(c)
347 	   );
348   }
349 }
350 
351 
352 static inline int
is_xml_chname(dtd_charclass * map,int c)353 is_xml_chname(dtd_charclass *map, int c)
354 { if ( c <= 0xff )
355   { return (map->class[c] & CH_NAME);
356   } else
357   { return ( xml_basechar(c) ||
358 	     xml_digit(c) ||
359 	     xml_ideographic(c) ||
360 	     xml_combining_char(c) ||
361 	     xml_extender(c)
362 	   );
363   }
364 }
365 
366 static dtd_charclass *map;
367 
368 static foreign_t
xml_name(term_t in,term_t encoding)369 xml_name(term_t in, term_t encoding)
370 { char *ins;
371   wchar_t *inW;
372   size_t len;
373   unsigned int i;
374   int maxchr;
375 
376   if ( !get_max_chr(encoding, &maxchr) )
377     return FALSE;
378 
379   if ( !map )
380     map = new_charclass();
381 
382   if ( PL_get_nchars(in, &len, &ins, CVT_ATOMIC) )
383   { int c;
384 
385     if ( len == 0 )
386       return FALSE;
387 
388     c = ins[0] & 0xff;
389     if ( c > maxchr )
390       return FALSE;
391 
392     if ( !(map->class[c] & CH_NMSTART) )
393       return FALSE;
394     for(i=1; i<len; i++)
395     { c = ins[i] & 0xff;
396 
397       if ( c > maxchr || !(map->class[c] & CH_NAME) )
398 	return FALSE;
399     }
400 
401     return TRUE;
402   }
403   if ( PL_get_wchars(in, &len, &inW, CVT_ATOMIC) )
404   { if ( len == 0 )
405       return FALSE;
406 
407     if ( inW[0] > maxchr ||
408 	 !is_xml_nmstart(map, inW[0]) )
409       return FALSE;
410 
411     for(i=1; i<len; i++)
412     { int c = inW[i];
413 
414       if ( c > maxchr ||
415 	   !is_xml_chname(map, c) )
416 	return FALSE;
417     }
418 
419     return TRUE;
420   }
421 
422   return FALSE;
423 }
424 
425 
426 static foreign_t
iri_xml_namespace(term_t iri,term_t namespace,term_t localname)427 iri_xml_namespace(term_t iri, term_t namespace, term_t localname)
428 { char *s;
429   pl_wchar_t *w;
430   size_t len;
431 
432   if ( !map )
433     map = new_charclass();
434 
435   if ( PL_get_nchars(iri, &len, &s, CVT_ATOM|CVT_STRING) )
436   { const char *e = &s[len];
437     const char *p = e;
438 
439     while(p>s && (map->class[p[-1]&0xff] & CH_NAME))
440       p--;
441     while(p<e && !(map->class[p[0]&0xff] & CH_NMSTART))
442       p++;
443 
444     if ( !PL_unify_atom_nchars(namespace, p-s, s) )
445       return FALSE;
446     if ( localname &&
447 	 !PL_unify_atom_nchars(localname, e-p, p) )
448       return FALSE;
449 
450     return TRUE;
451   } else if ( PL_get_wchars(iri, &len, &w, CVT_ATOM|CVT_STRING|CVT_EXCEPTION) )
452   { const pl_wchar_t *e = &w[len];
453     const pl_wchar_t *p = e;
454 
455     while(p>w && is_xml_chname(map, p[-1]) )
456       p--;
457     while(p<e && !is_xml_nmstart(map, p[0]) )
458       p++;
459 
460     if ( !PL_unify_wchars(namespace, PL_ATOM, p-w, w) )
461       return FALSE;
462     if ( localname &&
463 	 !PL_unify_wchars(localname, PL_ATOM, e-p, p) )
464       return FALSE;
465 
466     return TRUE;
467   }
468 
469   return FALSE;
470 }
471 
472 
473 static foreign_t
iri_xml_namespace2(term_t iri,term_t namespace)474 iri_xml_namespace2(term_t iri, term_t namespace)
475 { return iri_xml_namespace(iri, namespace, 0);
476 }
477 
478 
479 static foreign_t
pl_xml_basechar(term_t t)480 pl_xml_basechar(term_t t)
481 { int c;
482 
483   if ( PL_get_char_ex(t, &c, FALSE) &&
484        xml_basechar(c) )
485     return TRUE;
486 
487   return FALSE;
488 }
489 
490 
491 static foreign_t
pl_xml_ideographic(term_t t)492 pl_xml_ideographic(term_t t)
493 { int c;
494 
495   if ( PL_get_char_ex(t, &c, FALSE) &&
496        xml_ideographic(c) )
497     return TRUE;
498 
499   return FALSE;
500 }
501 
502 
503 static foreign_t
pl_xml_combining_char(term_t t)504 pl_xml_combining_char(term_t t)
505 { int c;
506 
507   if ( PL_get_char_ex(t, &c, FALSE) &&
508        xml_combining_char(c) )
509     return TRUE;
510 
511   return FALSE;
512 }
513 
514 
515 static foreign_t
pl_xml_digit(term_t t)516 pl_xml_digit(term_t t)
517 { int c;
518 
519   if ( PL_get_char_ex(t, &c, FALSE) &&
520        xml_digit(c) )
521     return TRUE;
522 
523   return FALSE;
524 }
525 
526 
527 static foreign_t
pl_xml_extender(term_t t)528 pl_xml_extender(term_t t)
529 { int c;
530 
531   if ( PL_get_char_ex(t, &c, FALSE) &&
532        xml_extender(c) )
533     return TRUE;
534 
535   return FALSE;
536 }
537 
538 
539 install_t
install_xml_quote()540 install_xml_quote()
541 { ATOM_iso_latin_1 = PL_new_atom("iso_latin_1");
542   ATOM_utf8        = PL_new_atom("utf8");
543   ATOM_unicode     = PL_new_atom("unicode");
544   ATOM_ascii       = PL_new_atom("ascii");
545 
546   PL_register_foreign("xml_quote_attribute", 3,	xml_quote_attribute,   0);
547   PL_register_foreign("xml_quote_cdata",     3,	xml_quote_cdata,       0);
548   PL_register_foreign("xml_name",	     2,	xml_name,	       0);
549   PL_register_foreign("xml_basechar",	     1,	pl_xml_basechar,       0);
550   PL_register_foreign("xml_ideographic",     1,	pl_xml_ideographic,    0);
551   PL_register_foreign("xml_combining_char",  1,	pl_xml_combining_char, 0);
552   PL_register_foreign("xml_digit",	     1,	pl_xml_digit,	       0);
553   PL_register_foreign("xml_extender",	     1,	pl_xml_extender,       0);
554   PL_register_foreign("iri_xml_namespace",   3,	iri_xml_namespace,     0);
555   PL_register_foreign("iri_xml_namespace",   2,	iri_xml_namespace2,    0);
556 }
557