1 /* Part of SWI-Prolog
2
3 Author: Jan Wielemaker
4 E-mail: J.Wielemaker@vu.nl
5 WWW: http://www.swi-prolog.org
6 Copyright (c) 2006-2016, University of Amsterdam
7 VU University Amsterdam
8 All rights reserved.
9
10 Redistribution and use in source and binary forms, with or without
11 modification, are permitted provided that the following conditions
12 are met:
13
14 1. Redistributions of source code must retain the above copyright
15 notice, this list of conditions and the following disclaimer.
16
17 2. Redistributions in binary form must reproduce the above copyright
18 notice, this list of conditions and the following disclaimer in
19 the documentation and/or other materials provided with the
20 distribution.
21
22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
32 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 POSSIBILITY OF SUCH DAMAGE.
34 */
35
36 #include <config.h>
37 #include <SWI-Stream.h>
38 #include <SWI-Prolog.h>
39 #include "rdf_db.h"
40 #include "atom.h"
41 #include "murmur.h"
42 #include <wchar.h>
43 #include <wctype.h>
44 #include <assert.h>
45
46 #ifdef __WINDOWS__
47 #define inline __inline
48 #endif
49
50 #include "unicode_map.c"
51
52
53 /*******************************
54 * TEXT HANDLING *
55 *******************************/
56
57 int
fetch_atom_text(atom_t atom,text * txt)58 fetch_atom_text(atom_t atom, text *txt)
59 { if ( (txt->a = (const charA*)PL_atom_nchars(atom, &txt->length)) )
60 { txt->w = NULL;
61 return TRUE;
62 }
63 if ( (txt->w = (const charW*)PL_atom_wchars(atom, &txt->length)) )
64 { txt->a = NULL;
65 return TRUE;
66 }
67
68 return FALSE;
69 }
70
71
72 static inline wint_t
fetch(const text * txt,int i)73 fetch(const text *txt, int i)
74 { return txt->a ? (wint_t)txt->a[i] : (wint_t)txt->w[i];
75 }
76
77
78 int
fill_atom_info(atom_info * info)79 fill_atom_info(atom_info *info)
80 { if ( !info->resolved )
81 { info->resolved = TRUE;
82
83 if ( !(info->rc=fetch_atom_text(info->handle, &info->text)) )
84 { info->text.a = NULL;
85 info->text.w = NULL;
86 }
87 }
88
89 return info->rc;
90 }
91
92
93 /*******************************
94 * COMPARE *
95 *******************************/
96
97 static inline int
cmpA(int c1,int c2,int * dl2)98 cmpA(int c1, int c2, int *dl2)
99 { if ( c1 == c2 )
100 { return 0;
101 } else
102 { int k1 = sort_pointA(c1);
103 int k2 = sort_pointA(c2);
104 int d;
105
106 if ( (d=((k1>>8)-(k2>>8))) == 0 )
107 { if ( *dl2 == 0 )
108 *dl2 = (k1&0xff) - (k2&0xff);
109 }
110
111 return d;
112 }
113 }
114
115
116 static inline int
cmpW(int c1,int c2,int * dl2)117 cmpW(int c1, int c2, int *dl2)
118 { if ( c1 == c2 )
119 { return 0;
120 } else
121 { int k1 = sort_point(c1);
122 int k2 = sort_point(c2);
123 int d;
124
125 if ( (d=((k1>>8)-(k2>>8))) == 0 )
126 { if ( *dl2 == 0 )
127 *dl2 = (k1&0xff) - (k2&0xff);
128 }
129
130 return d;
131 }
132 }
133
134
135 int
cmp_atom_info(atom_info * info,atom_t a2)136 cmp_atom_info(atom_info *info, atom_t a2)
137 { text t2;
138 int i;
139 int dl2 = 0;
140 size_t n;
141
142 if ( info->handle == a2 )
143 return 0;
144
145 if ( !fill_atom_info(info) ||
146 !fetch_atom_text(a2, &t2) )
147 { goto cmphandles; /* non-text atoms? */
148 }
149
150 if ( info->text.a && t2.a )
151 { const charA *s1 = info->text.a;
152 const charA *s2 = t2.a;
153 int d;
154
155 while((d=cmpA(*s1, *s2, &dl2)) == 0)
156 { if ( *s1 == 0 )
157 goto eq;
158 s1++, s2++;
159 }
160 return d;
161 }
162
163 n = (info->text.length < t2.length ? info->text.length : t2.length);
164
165 if ( info->text.w && t2.w )
166 { const charW *s1 = info->text.w;
167 const charW *s2 = t2.w;
168
169 for(;;s1++, s2++)
170 { if ( n-- == 0 )
171 { if ( info->text.length == t2.length )
172 goto eq;
173
174 return info->text.length < t2.length ? -1 : 1;
175 } else
176 { int d;
177
178 if ( (d=cmpW(*s1, *s2, &dl2)) != 0 )
179 return d;
180 }
181 }
182 }
183
184 for(i=0; ; i++)
185 { if ( n-- == 0 )
186 { if ( info->text.length == t2.length )
187 goto eq;
188
189 return info->text.length < t2.length ? -1 : 1;
190 } else
191 { wint_t c1 = fetch(&info->text, i);
192 wint_t c2 = fetch(&t2, i);
193 int d;
194
195 if ( (d=cmpW(c1, c2, &dl2)) != 0 )
196 return d;
197 }
198 }
199
200 eq:
201 if ( dl2 )
202 return dl2;
203
204 cmphandles:
205 return info->handle < a2 ? -1 : 1; /* == already covered */
206 }
207
208
209 int
cmp_atoms(atom_t a1,atom_t a2)210 cmp_atoms(atom_t a1, atom_t a2)
211 { atom_info info = {0};
212
213 if ( a1 == a2 )
214 return 0;
215
216 info.handle = a1;
217
218 return cmp_atom_info(&info, a2);
219 }
220
221
222 /*******************************
223 * HASH *
224 *******************************/
225
226 static unsigned int
string_hashA(const char * s,size_t len)227 string_hashA(const char *s, size_t len)
228 { const unsigned char *t = (const unsigned char *)s;
229 unsigned int hash = 0;
230
231 while( len>0 )
232 { unsigned char buf[256];
233 unsigned char *o = buf-1;
234 int cp = len > 256 ? 256 : (int)len;
235 const unsigned char *e = t+cp;
236
237 t--;
238 while(++t<e)
239 *++o = sort_pointA(*t)>>8;
240 hash ^= rdf_murmer_hash(buf, cp, MURMUR_SEED);
241
242 len -= cp;
243 }
244
245 return hash;
246 }
247
248
249 static unsigned int
string_hashW(const wchar_t * t,size_t len)250 string_hashW(const wchar_t *t, size_t len)
251 { unsigned int hash = 0;
252
253 while( len>0 )
254 { unsigned short buf[256];
255 unsigned short *o = buf;
256 int cp = len > 256 ? 256 : (int)len;
257 const wchar_t *e = t+cp;
258
259 while(t<e)
260 *o++ = (short)(sort_point(*t++)>>8);
261 hash ^= rdf_murmer_hash(buf, cp*sizeof(short), MURMUR_SEED);
262
263 len -= cp;
264 }
265
266 return hash;
267 }
268
269
270 unsigned int
atom_hash_case(atom_t a)271 atom_hash_case(atom_t a)
272 { const char *s;
273 const wchar_t *w;
274 size_t len;
275
276 if ( (s = PL_atom_nchars(a, &len)) )
277 return string_hashA(s, len);
278 else if ( (w = PL_atom_wchars(a, &len)) )
279 return string_hashW(w, len);
280 else
281 { assert(0);
282 return 0;
283 }
284 }
285
286
287 /*******************************
288 * FIND FIRST *
289 *******************************/
290
291 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
292 Given an atom, return a new one that has all its characters modified
293 such that it appears first in the set of atoms considered equal after
294 case canonisation and diacritics removal. This is required for prefix
295 search to find the first atom of the set.
296 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
297
298 atom_t
first_atom(atom_t a,int match)299 first_atom(atom_t a, int match)
300 { text t;
301
302 if ( !fetch_atom_text(a, &t) )
303 { return (atom_t)0; /* not a textual atom */
304 } else
305 { size_t len = t.length;
306 wchar_t buf[256];
307 wchar_t *out, *s;
308 int i;
309 atom_t rc;
310
311 if ( len <= 256 )
312 out = buf;
313 else
314 out = PL_malloc(len*sizeof(wchar_t));
315
316 for(s=out,i=0; i<len; s++,i++)
317 { wint_t c = fetch(&t,i);
318
319 if ( c == '*' && match == STR_MATCH_LIKE )
320 { if ( i == 0 ) /* like '*...' */
321 { rc = 0;
322 goto out;
323 }
324 len = i; /* only up to the first * */
325 break;
326 }
327 *s = sort_point(c)>>8;
328 }
329
330 rc = PL_new_atom_wchars(len, out);
331
332 out:
333 if ( out != buf )
334 PL_free(out);
335
336 return rc;
337 }
338 }
339
340 /*******************************
341 * MATCH *
342 *******************************/
343
344 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
345 With the introduction of wide characters there are two versions of the
346 match() function, one using char* and one using a structure and index to
347 fetch characters. Overall performance of the first function is about
348 twice as good as the general one and as most data will be handled by
349 this function in practice I think it is worthwhile to have two
350 implementations. Both implementations are very similar in design and
351 likely to have the same bugs. If you find one, please fix it in both
352 branches!
353 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
354
355 static const charA *
nextwordA(const charA * s)356 nextwordA(const charA *s)
357 { while(*s && iswalnum(*s))
358 s++;
359 while(*s && !iswalnum(*s))
360 s++;
361
362 return s;
363 }
364
365
366 #define cmp_pointA(i) (sort_pointA(i)>>8)
367
368
369 static int
matchA(int how,const charA * f,const charA * l)370 matchA(int how, const charA *f, const charA *l)
371 { switch(how)
372 { case STR_MATCH_ICASE:
373 { for( ; *l && *f; l++, f++ )
374 { if ( cmp_pointA(*l) != cmp_pointA(*f) )
375 return FALSE;
376 }
377 if ( *l == '\0' && *f == '\0' )
378 return TRUE;
379
380 return FALSE;
381 }
382 case STR_MATCH_PREFIX:
383 { for( ; *l && *f; l++, f++ )
384 { if ( cmp_pointA(*l) != cmp_pointA(*f) )
385 return FALSE;
386 }
387 if ( *f == '\0' )
388 return TRUE;
389
390 return FALSE;
391 }
392 case STR_MATCH_SUBSTRING: /* use Boyle-More! */
393 { const charA *h;
394 const charA *f0 = f;
395
396 for(h=l; *h; h++)
397 { for( l=h,f=f0; *l && *f; l++, f++ )
398 { if ( cmp_pointA(*l) != cmp_pointA(*f) )
399 break;
400 }
401 if ( *f == '\0' )
402 return TRUE;
403 if ( *h == '\0' )
404 return FALSE;
405 }
406
407 return FALSE;
408 }
409 case STR_MATCH_WORD:
410 { const charA *h;
411 const charA *f0 = f;
412
413 for(h=l; *h; h = nextwordA(h))
414 { for( l=h,f=f0; *l && *f; l++, f++ )
415 { if ( cmp_pointA(*l) != cmp_pointA(*f) )
416 break;
417 }
418 if ( *f == '\0' )
419 { if ( *l == '\0' || !iswalnum(*l) )
420 return TRUE;
421 }
422 if ( *l == '\0' )
423 return FALSE;
424 }
425
426 return FALSE;
427 }
428 case STR_MATCH_LIKE: /* SeRQL like: * --> wildcart */
429 { typedef struct chp { const charA *pattern;
430 const charA *label; } chp;
431 chp chps[MAX_LIKE_CHOICES];
432 int chn=0;
433
434 for( ; *l && *f; l++, f++ )
435 { if ( *f == '*' )
436 { f++;
437
438 if ( *f == '\0' ) /* foo* */
439 return TRUE;
440
441 search_like:
442 while ( *l && cmp_pointA(*l) != cmp_pointA(*f) )
443 l++;
444
445 if ( *l )
446 { if ( chn >= MAX_LIKE_CHOICES )
447 { Sdprintf("rdf_db: too many * in `like' expression (>%d)",
448 MAX_LIKE_CHOICES);
449 return FALSE;
450 }
451 chps[chn].pattern = f;
452 chps[chn].label = l+1;
453 chn++;
454
455 continue;
456 } else
457 goto retry_like;
458 }
459
460 if ( cmp_pointA(*l) != cmp_pointA(*f) )
461 goto retry_like;
462 }
463 if ( *l == '\0' && (*f == '\0' ||
464 (*f == '*' && f[1] == '\0')) )
465 return TRUE;
466
467 retry_like:
468 if ( chn > 0 )
469 { chn--;
470 f = chps[chn].pattern;
471 l = chps[chn].label;
472 goto search_like;
473 }
474
475 return FALSE;
476 }
477 default:
478 assert(0);
479 return FALSE;
480 }
481 }
482
483
484 static unsigned int
nextword(text * txt,unsigned int i)485 nextword(text *txt, unsigned int i)
486 { while(i<txt->length && iswalnum(fetch(txt, i)))
487 i++;
488 while(i<txt->length && !iswalnum(fetch(txt, i)))
489 i++;
490
491 return i;
492 }
493
494
495 #define cmp_point(i) (sort_point(i)>>8)
496
497
498 int
match_atoms(int how,atom_t search,atom_t label)499 match_atoms(int how, atom_t search, atom_t label)
500 { text l, f;
501
502 if ( !fetch_atom_text(label, &l) ||
503 !fetch_atom_text(search, &f) )
504 return FALSE; /* error? */
505
506 return match_text(how, &f, &l);
507 }
508
509
510 int
match_text(int how,text * f,text * l)511 match_text(int how, text *f, text *l)
512 { if ( f->length == 0 )
513 return TRUE;
514
515 if ( f->a && l->a )
516 return matchA(how, f->a, l->a);
517
518 switch(how)
519 { case STR_MATCH_ICASE:
520 { if ( l->length == f->length )
521 { unsigned int i;
522
523 for(i=0; i<l->length; i++ )
524 { if ( cmp_point(fetch(l, i)) != cmp_point(fetch(f, i)) )
525 return FALSE;
526 }
527
528 return TRUE;
529 }
530
531 return FALSE;
532 }
533 case STR_MATCH_PREFIX:
534 { if ( f->length <= l->length )
535 { unsigned int i;
536
537 for(i=0; i<f->length; i++ )
538 { if ( cmp_point(fetch(l, i)) != cmp_point(fetch(f, i)) )
539 return FALSE;
540 }
541
542 return TRUE;
543 }
544
545 return FALSE;
546 }
547 case STR_MATCH_SUBSTRING: /* use Boyle-More! */
548 { if ( f->length <= l->length )
549 { unsigned int i, s;
550
551 for(s=0; s+f->length <= l->length; s++)
552 { for(i=0; i<f->length; i++)
553 { if ( cmp_point(fetch(l, i+s)) != cmp_point(fetch(f, i)) )
554 goto snext;
555 }
556 return TRUE;
557
558 snext:;
559 }
560 }
561
562 return FALSE;
563 }
564 case STR_MATCH_WORD:
565 { if ( f->length <= l->length )
566 { unsigned int i, s;
567
568 for(s=0; s+f->length <= l->length; s = nextword(l, s))
569 { for(i=0; i<f->length; i++)
570 { if ( cmp_point(fetch(l, i+s)) != cmp_point(fetch(f, i)) )
571 goto wnext;
572 }
573 if ( i+s == l->length || !iswalnum(fetch(l,i+s)) )
574 return TRUE;
575
576 wnext:;
577 }
578 }
579
580 return FALSE;
581 }
582 case STR_MATCH_LIKE: /* SeRQL like: * --> wildcart */
583 { unsigned int ip, il;
584 typedef struct chp { unsigned int ip;
585 unsigned int il;
586 } chp;
587 chp chps[MAX_LIKE_CHOICES];
588 int chn=0;
589
590 for(ip=il=0; il < l->length && ip < f->length; ip++, il++ )
591 { if ( fetch(f, ip) == '*' )
592 { ip++;
593
594 if ( ip == f->length ) /* foo* */
595 return TRUE;
596
597 search_like:
598 while ( il < l->length &&
599 cmp_point(fetch(l, il)) != cmp_point(fetch(f, ip)) )
600 il++;
601
602 if ( il < l->length )
603 { if ( chn >= MAX_LIKE_CHOICES )
604 { Sdprintf("rdf_db: too many * in `like' expression (>%d)",
605 MAX_LIKE_CHOICES);
606 return FALSE;
607 }
608 chps[chn].ip = ip;
609 chps[chn].il = il+1;
610 chn++;
611
612 continue;
613 } else
614 goto retry_like;
615 }
616
617 if ( cmp_point(fetch(l, il)) != cmp_point(fetch(f, ip)) )
618 goto retry_like;
619 }
620 if ( il == l->length && (ip == f->length ||
621 (fetch(f,ip) == '*' && ip+1 == f->length)) )
622 return TRUE;
623
624 retry_like:
625 if ( chn > 0 )
626 { chn--;
627 ip = chps[chn].ip;
628 il = chps[chn].il;
629 goto search_like;
630 }
631
632 return FALSE;
633 }
634 default:
635 assert(0);
636 return FALSE;
637 }
638 }
639
640
641 /*******************************
642 * LANGUAGE MATCH *
643 *******************************/
644
645 typedef struct lang_choice
646 { int langp; /* points after - */
647 int patp; /* points after *- */
648 } lang_choice;
649
650 #define MAX_CHOICES 10 /* Max number of stars */
651
652 typedef struct
653 { int il, ip;
654 text l, p;
655 lang_choice choicepoints[MAX_CHOICES];
656 int choice_count;
657 } lang_state;
658
659
660 static int
create_chp(lang_state * s)661 create_chp(lang_state *s)
662 { if ( s->choice_count < MAX_CHOICES )
663 { lang_choice *cp = &s->choicepoints[s->choice_count];
664
665 cp->langp = s->il;
666 cp->patp = s->ip+2;
667 s->choice_count++;
668
669 return TRUE;
670 }
671
672 return FALSE;
673 }
674
675
676 static int
next_choice(lang_state * s)677 next_choice(lang_state *s)
678 { for ( ; s->choice_count > 0; s->choice_count-- )
679 { lang_choice *cp = &s->choicepoints[s->choice_count-1];
680 int il = cp->langp;
681
682 for(; il<s->l.length; il++)
683 { if ( fetch(&s->l, il) == '-' )
684 { cp->langp = s->il = il+1;
685 s->ip = cp->patp;
686 return TRUE;
687 }
688 }
689 }
690
691 return FALSE;
692 }
693
694
695 static atom_t ATOM_;
696 static atom_t ATOM_star;
697
698 int
atom_lang_matches(atom_t lang,atom_t pattern)699 atom_lang_matches(atom_t lang, atom_t pattern)
700 { lang_state s = {0};
701 int cl, cp;
702
703 if ( lang == pattern ) /* exact match */
704 return TRUE;
705
706 if ( !ATOM_ )
707 { ATOM_ = PL_new_atom("");
708 ATOM_star = PL_new_atom("*");
709 }
710
711 if ( lang == ATOM_ ) /* no language */
712 return FALSE;
713 if ( pattern == ATOM_star ) /* Everything matches "*" */
714 return TRUE;
715
716 if ( !fetch_atom_text(lang, &s.l) ||
717 !fetch_atom_text(pattern, &s.p) )
718 return FALSE; /* exception? */
719
720 s.il=0; s.ip=0;
721 for(;; s.ip++, s.il++)
722 { if ( s.ip == s.p.length )
723 return TRUE;
724 if ( s.il == s.l.length )
725 { if ( fetch(&s.p, s.ip) == '*' )
726 return TRUE;
727 if ( !next_choice(&s) )
728 return FALSE;
729 }
730
731 cl = fetch(&s.l, s.il);
732 cp = fetch(&s.p, s.ip);
733 if ( cl == cp )
734 continue;
735 if ( sort_point(cl)>>8 == sort_point(cp)>>8 )
736 continue;
737
738 if ( cp == '*' )
739 { if ( s.ip+1 == s.p.length )
740 return TRUE;
741 if ( (s.ip == 0 || fetch(&s.p, s.ip-1) == '-') &&
742 fetch(&s.p, s.ip+1) == '-' )
743 { if ( !create_chp(&s) )
744 return FALSE;
745 }
746 }
747
748 if ( !next_choice(&s) )
749 return FALSE;
750 }
751 }
752