1 /* Part of SWI-Prolog
2
3 Author: Jan Wielemaker
4 E-mail: J.Wielemaker@vu.nl
5 WWW: http://www.swi-prolog.org
6 Copyright (c) 2002-2015, University of Amsterdam
7 VU University Amsterdam
8 All rights reserved.
9
10 Redistribution and use in source and binary forms, with or without
11 modification, are permitted provided that the following conditions
12 are met:
13
14 1. Redistributions of source code must retain the above copyright
15 notice, this list of conditions and the following disclaimer.
16
17 2. Redistributions in binary form must reproduce the above copyright
18 notice, this list of conditions and the following disclaimer in
19 the documentation and/or other materials provided with the
20 distribution.
21
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
32 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 POSSIBILITY OF SUCH DAMAGE.
34 */
35
36 #include <config.h>
37 #include <SWI-Stream.h> /* encoding */
38 #include <SWI-Prolog.h>
39 #include <stdlib.h>
40 #ifdef HAVE_MALLOC_H
41 #include <malloc.h>
42 #endif
43 #include "error.h"
44 #include <errno.h>
45 #include <string.h>
46 #include <stdio.h>
47 #include <wctype.h>
48 #include "xml_unicode.h"
49 #include "dtd.h"
50 #ifdef __WINDOWS__
51 #define inline __inline
52 #endif
53
54 static atom_t ATOM_iso_latin_1;
55 static atom_t ATOM_utf8;
56 static atom_t ATOM_unicode;
57 static atom_t ATOM_ascii;
58
59 #define CHARSET 256
60
61 typedef struct charbuf
62 { char buffer[1024];
63 char *bufp;
64 char *end;
65 size_t size;
66 } charbuf;
67
68
69 static void
init_buf(charbuf * b)70 init_buf(charbuf *b)
71 { b->bufp = b->end = b->buffer;
72 b->size = sizeof(b->buffer);
73 }
74
75
76 static void
free_buf(charbuf * b)77 free_buf(charbuf *b)
78 { if ( b->bufp != b->buffer )
79 free(b->bufp);
80 }
81
82
83 static int
room_buf(charbuf * b,size_t room)84 room_buf(charbuf *b, size_t room)
85 { size_t used = b->end - b->bufp;
86
87 if ( room + used > b->size )
88 { if ( b->bufp == b->buffer )
89 { b->size = sizeof(b->buffer)*2;
90 if ( !(b->bufp = malloc(b->size)) )
91 return sgml2pl_error(ERR_ERRNO);
92
93 memcpy(b->bufp, b->buffer, used);
94 } else
95 { char *ptr;
96
97 b->size *= 2;
98 if ( !(ptr = realloc(b->bufp, b->size)) )
99 return sgml2pl_error(ERR_ERRNO);
100 b->bufp = ptr;
101 }
102 b->end = b->bufp + used;
103 }
104
105 return TRUE;
106 }
107
108
109 static size_t
used_buf(const charbuf * b)110 used_buf(const charbuf *b)
111 { return b->end - b->bufp;
112 }
113
114
115 static int
add_char_buf(charbuf * b,int chr)116 add_char_buf(charbuf *b, int chr)
117 { if ( room_buf(b, 1) )
118 { *b->end++ = chr;
119
120 return TRUE;
121 }
122
123 return FALSE;
124 }
125
126
127 static int
add_char_bufW(charbuf * b,int chr)128 add_char_bufW(charbuf *b, int chr)
129 { if ( room_buf(b, sizeof(wchar_t)) )
130 { wchar_t *p = (wchar_t*)b->end;
131
132 *p++ = chr;
133 b->end = (char *)p;
134
135 return TRUE;
136 }
137
138 return FALSE;
139 }
140
141
142 static int
add_str_buf(charbuf * b,const char * s)143 add_str_buf(charbuf *b, const char *s)
144 { size_t len = strlen(s);
145
146 if ( room_buf(b, len+1) )
147 { memcpy(b->end, s, len+1);
148 b->end += len;
149
150 return TRUE;
151 }
152
153 return FALSE;
154 }
155
156
157 static int
add_str_bufW(charbuf * b,const char * s)158 add_str_bufW(charbuf *b, const char *s)
159 { size_t len = strlen(s);
160
161 if ( room_buf(b, len*sizeof(wchar_t)) )
162 { wchar_t *p = (wchar_t*)b->end;
163
164 while(*s)
165 *p++ = *s++;
166 b->end = (char *)p;
167
168 return TRUE;
169 }
170
171 return FALSE;
172 }
173
174
175
176 static foreign_t
do_quote(term_t in,term_t quoted,char ** map,int maxchr)177 do_quote(term_t in, term_t quoted, char **map, int maxchr)
178 { char *inA = NULL;
179 wchar_t *inW = NULL;
180 size_t len;
181 const unsigned char *s;
182 charbuf buffer;
183 int changes = 0;
184 int rc;
185
186 if ( !PL_get_nchars(in, &len, &inA, CVT_ATOMIC) &&
187 !PL_get_wchars(in, &len, &inW, CVT_ATOMIC) )
188 return sgml2pl_error(ERR_TYPE, "atom", in);
189 if ( len == 0 )
190 return PL_unify(in, quoted);
191
192 init_buf(&buffer);
193
194 if ( inA )
195 { for(s = (unsigned char*)inA ; len-- > 0; s++ )
196 { int c = *s;
197
198 if ( map[c] )
199 { if ( !add_str_buf(&buffer, map[c]) )
200 return FALSE;
201
202 changes++;
203 } else if ( c > maxchr )
204 { char buf[20];
205
206 sprintf(buf, "&#%d;", c);
207 if ( !add_str_buf(&buffer, buf) )
208 return FALSE;
209
210 changes++;
211 } else
212 { add_char_buf(&buffer, c);
213 }
214 }
215
216 if ( changes > 0 )
217 rc = PL_unify_atom_nchars(quoted, used_buf(&buffer), buffer.bufp);
218 else
219 rc = PL_unify(in, quoted);
220 } else
221 { for( ; len-- > 0; inW++ )
222 { int c = *inW;
223
224 if ( c <= 0xff && map[c] )
225 { if ( !add_str_bufW(&buffer, map[c]) )
226 return FALSE;
227
228 changes++;
229 } else if ( c > maxchr )
230 { char buf[20];
231
232 sprintf(buf, "&#%d;", c);
233 if ( !add_str_bufW(&buffer, buf) )
234 return FALSE;
235
236 changes++;
237 }else
238 { add_char_bufW(&buffer, c);
239 }
240 }
241
242 if ( changes > 0 )
243 rc = PL_unify_wchars(quoted, PL_ATOM,
244 used_buf(&buffer)/sizeof(wchar_t),
245 (wchar_t*)buffer.bufp);
246 else
247 rc = PL_unify(in, quoted);
248 }
249
250 free_buf(&buffer);
251
252 return rc;
253 }
254
255
256 static int
get_max_chr(term_t t,int * maxchr)257 get_max_chr(term_t t, int *maxchr)
258 { atom_t a;
259
260 if ( PL_get_atom(t, &a) )
261 { if ( a == ATOM_iso_latin_1 )
262 *maxchr = 0xff;
263 else if ( a == ATOM_utf8 )
264 *maxchr = 0x7ffffff;
265 else if ( a == ATOM_unicode )
266 *maxchr = 0xffff;
267 else if ( a == ATOM_ascii )
268 *maxchr = 0x7f;
269 else
270 return sgml2pl_error(ERR_DOMAIN, "encoding", t);
271
272 return TRUE;
273 }
274
275 return sgml2pl_error(ERR_TYPE, "atom", t);
276 }
277
278
279 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
280 (*) xml_quote_attribute/3 assumes the attribute is quoted using "" and
281 does *not* escape '. Although escaping ' with ' is valid XML, it is
282 *not* valid html, and this routine is also used by the html_write
283 library.
284 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
285
286 static foreign_t
xml_quote_attribute(term_t in,term_t out,term_t encoding)287 xml_quote_attribute(term_t in, term_t out, term_t encoding)
288 { static char **map;
289 int maxchr;
290
291 if ( !map )
292 { int i;
293
294 if ( !(map = malloc(CHARSET*sizeof(char*))) )
295 return sgml2pl_error(ERR_ERRNO, errno);
296
297 for(i=0; i<CHARSET; i++)
298 map[i] = NULL;
299
300 map['<'] = "<";
301 map['>'] = ">";
302 map['&'] = "&";
303 /* map['\''] = "'"; See (*) */
304 map['"'] = """;
305 }
306
307 if ( !get_max_chr(encoding, &maxchr) )
308 return FALSE;
309
310 return do_quote(in, out, map, maxchr);
311 }
312
313
314 static foreign_t
xml_quote_cdata(term_t in,term_t out,term_t encoding)315 xml_quote_cdata(term_t in, term_t out, term_t encoding)
316 { static char **map;
317 int maxchr;
318
319 if ( !map )
320 { int i;
321
322 if ( !(map = malloc(CHARSET*sizeof(char*))) )
323 return sgml2pl_error(ERR_ERRNO, errno);
324
325 for(i=0; i<CHARSET; i++)
326 map[i] = NULL;
327
328 map['<'] = "<";
329 map['>'] = ">";
330 map['&'] = "&";
331 }
332
333 if ( !get_max_chr(encoding, &maxchr) )
334 return FALSE;
335
336 return do_quote(in, out, map, maxchr);
337 }
338
339
340 static inline int
is_xml_nmstart(dtd_charclass * map,int c)341 is_xml_nmstart(dtd_charclass *map, int c)
342 { if ( c <= 0xff )
343 { return (map->class[c] & CH_NMSTART);
344 } else
345 { return ( xml_basechar(c) ||
346 xml_ideographic(c)
347 );
348 }
349 }
350
351
352 static inline int
is_xml_chname(dtd_charclass * map,int c)353 is_xml_chname(dtd_charclass *map, int c)
354 { if ( c <= 0xff )
355 { return (map->class[c] & CH_NAME);
356 } else
357 { return ( xml_basechar(c) ||
358 xml_digit(c) ||
359 xml_ideographic(c) ||
360 xml_combining_char(c) ||
361 xml_extender(c)
362 );
363 }
364 }
365
366 static dtd_charclass *map;
367
368 static foreign_t
xml_name(term_t in,term_t encoding)369 xml_name(term_t in, term_t encoding)
370 { char *ins;
371 wchar_t *inW;
372 size_t len;
373 unsigned int i;
374 int maxchr;
375
376 if ( !get_max_chr(encoding, &maxchr) )
377 return FALSE;
378
379 if ( !map )
380 map = new_charclass();
381
382 if ( PL_get_nchars(in, &len, &ins, CVT_ATOMIC) )
383 { int c;
384
385 if ( len == 0 )
386 return FALSE;
387
388 c = ins[0] & 0xff;
389 if ( c > maxchr )
390 return FALSE;
391
392 if ( !(map->class[c] & CH_NMSTART) )
393 return FALSE;
394 for(i=1; i<len; i++)
395 { c = ins[i] & 0xff;
396
397 if ( c > maxchr || !(map->class[c] & CH_NAME) )
398 return FALSE;
399 }
400
401 return TRUE;
402 }
403 if ( PL_get_wchars(in, &len, &inW, CVT_ATOMIC) )
404 { if ( len == 0 )
405 return FALSE;
406
407 if ( inW[0] > maxchr ||
408 !is_xml_nmstart(map, inW[0]) )
409 return FALSE;
410
411 for(i=1; i<len; i++)
412 { int c = inW[i];
413
414 if ( c > maxchr ||
415 !is_xml_chname(map, c) )
416 return FALSE;
417 }
418
419 return TRUE;
420 }
421
422 return FALSE;
423 }
424
425
426 static foreign_t
iri_xml_namespace(term_t iri,term_t namespace,term_t localname)427 iri_xml_namespace(term_t iri, term_t namespace, term_t localname)
428 { char *s;
429 pl_wchar_t *w;
430 size_t len;
431
432 if ( !map )
433 map = new_charclass();
434
435 if ( PL_get_nchars(iri, &len, &s, CVT_ATOM|CVT_STRING) )
436 { const char *e = &s[len];
437 const char *p = e;
438
439 while(p>s && (map->class[p[-1]&0xff] & CH_NAME))
440 p--;
441 while(p<e && !(map->class[p[0]&0xff] & CH_NMSTART))
442 p++;
443
444 if ( !PL_unify_atom_nchars(namespace, p-s, s) )
445 return FALSE;
446 if ( localname &&
447 !PL_unify_atom_nchars(localname, e-p, p) )
448 return FALSE;
449
450 return TRUE;
451 } else if ( PL_get_wchars(iri, &len, &w, CVT_ATOM|CVT_STRING|CVT_EXCEPTION) )
452 { const pl_wchar_t *e = &w[len];
453 const pl_wchar_t *p = e;
454
455 while(p>w && is_xml_chname(map, p[-1]) )
456 p--;
457 while(p<e && !is_xml_nmstart(map, p[0]) )
458 p++;
459
460 if ( !PL_unify_wchars(namespace, PL_ATOM, p-w, w) )
461 return FALSE;
462 if ( localname &&
463 !PL_unify_wchars(localname, PL_ATOM, e-p, p) )
464 return FALSE;
465
466 return TRUE;
467 }
468
469 return FALSE;
470 }
471
472
473 static foreign_t
iri_xml_namespace2(term_t iri,term_t namespace)474 iri_xml_namespace2(term_t iri, term_t namespace)
475 { return iri_xml_namespace(iri, namespace, 0);
476 }
477
478
479 static foreign_t
pl_xml_basechar(term_t t)480 pl_xml_basechar(term_t t)
481 { int c;
482
483 if ( PL_get_char_ex(t, &c, FALSE) &&
484 xml_basechar(c) )
485 return TRUE;
486
487 return FALSE;
488 }
489
490
491 static foreign_t
pl_xml_ideographic(term_t t)492 pl_xml_ideographic(term_t t)
493 { int c;
494
495 if ( PL_get_char_ex(t, &c, FALSE) &&
496 xml_ideographic(c) )
497 return TRUE;
498
499 return FALSE;
500 }
501
502
503 static foreign_t
pl_xml_combining_char(term_t t)504 pl_xml_combining_char(term_t t)
505 { int c;
506
507 if ( PL_get_char_ex(t, &c, FALSE) &&
508 xml_combining_char(c) )
509 return TRUE;
510
511 return FALSE;
512 }
513
514
515 static foreign_t
pl_xml_digit(term_t t)516 pl_xml_digit(term_t t)
517 { int c;
518
519 if ( PL_get_char_ex(t, &c, FALSE) &&
520 xml_digit(c) )
521 return TRUE;
522
523 return FALSE;
524 }
525
526
527 static foreign_t
pl_xml_extender(term_t t)528 pl_xml_extender(term_t t)
529 { int c;
530
531 if ( PL_get_char_ex(t, &c, FALSE) &&
532 xml_extender(c) )
533 return TRUE;
534
535 return FALSE;
536 }
537
538
539 install_t
install_xml_quote()540 install_xml_quote()
541 { ATOM_iso_latin_1 = PL_new_atom("iso_latin_1");
542 ATOM_utf8 = PL_new_atom("utf8");
543 ATOM_unicode = PL_new_atom("unicode");
544 ATOM_ascii = PL_new_atom("ascii");
545
546 PL_register_foreign("xml_quote_attribute", 3, xml_quote_attribute, 0);
547 PL_register_foreign("xml_quote_cdata", 3, xml_quote_cdata, 0);
548 PL_register_foreign("xml_name", 2, xml_name, 0);
549 PL_register_foreign("xml_basechar", 1, pl_xml_basechar, 0);
550 PL_register_foreign("xml_ideographic", 1, pl_xml_ideographic, 0);
551 PL_register_foreign("xml_combining_char", 1, pl_xml_combining_char, 0);
552 PL_register_foreign("xml_digit", 1, pl_xml_digit, 0);
553 PL_register_foreign("xml_extender", 1, pl_xml_extender, 0);
554 PL_register_foreign("iri_xml_namespace", 3, iri_xml_namespace, 0);
555 PL_register_foreign("iri_xml_namespace", 2, iri_xml_namespace2, 0);
556 }
557