1 /* $Id$
2
3 Part of the SWI-Prolog Semweb package
4
5 Author: Jan Wielemaker
6 E-mail: wielemak@science.uva.nl
7 WWW: http://www.swi-prolog.org
8 Copyright (C): 2006, University of Amsterdam
9
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License
12 as published by the Free Software Foundation; either version 2
13 of the License, or (at your option) any later version.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU Lesser General Public
21 License along with this library; if not, write to the Free Software
22 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25 #ifdef HAVE_CONFIG_H
26 #include <config.h>
27 #endif
28
29 #include <SWI-Stream.h>
30 #include <SWI-Prolog.h>
31 #include "atom.h"
32 #include "murmur.h"
33 #include <wchar.h>
34 #include <wctype.h>
35 #include <assert.h>
36
37 #ifdef __WINDOWS__
38 #define inline __inline
39 #endif
40
41 #include "unicode_map.c"
42
43
44 /*******************************
45 * TEXT HANDLING *
46 *******************************/
47
48 static inline int
get_atom_text(atom_t atom,text * txt)49 get_atom_text(atom_t atom, text *txt)
50 { if ( (txt->a = (const charA*)PL_atom_nchars(atom, &txt->length)) )
51 { txt->w = NULL;
52 return TRUE;
53 }
54 if ( (txt->w = (const charW*)PL_atom_wchars(atom, &txt->length)) )
55 { txt->a = NULL;
56 return TRUE;
57 }
58
59 return FALSE;
60 }
61
62
63 inline wint_t
fetch(const text * txt,int i)64 fetch(const text *txt, int i)
65 { return txt->a ? (wint_t)txt->a[i] : (wint_t)txt->w[i];
66 }
67
68
69 static int
fill_atom_info(atom_info * info)70 fill_atom_info(atom_info *info)
71 { if ( !info->resolved )
72 { info->resolved = TRUE;
73
74 if ( !(info->rc=get_atom_text(info->handle, &info->text)) )
75 { info->text.a = NULL;
76 info->text.w = NULL;
77 }
78 }
79
80 return info->rc;
81 }
82
83
84 /*******************************
85 * COMPARE *
86 *******************************/
87
88 static inline int
cmpA(int c1,int c2,int * dl2)89 cmpA(int c1, int c2, int *dl2)
90 { if ( c1 == c2 )
91 { return 0;
92 } else
93 { int k1 = sort_pointA(c1);
94 int k2 = sort_pointA(c2);
95 int d;
96
97 if ( (d=((k1>>8)-(k2>>8))) == 0 )
98 { if ( *dl2 == 0 )
99 *dl2 = (k1&0xff) - (k2&0xff);
100 }
101
102 return d;
103 }
104 }
105
106
107 static inline int
cmpW(int c1,int c2,int * dl2)108 cmpW(int c1, int c2, int *dl2)
109 { if ( c1 == c2 )
110 { return 0;
111 } else
112 { int k1 = sort_point(c1);
113 int k2 = sort_point(c2);
114 int d;
115
116 if ( (d=((k1>>8)-(k2>>8))) == 0 )
117 { if ( *dl2 == 0 )
118 *dl2 = (k1&0xff) - (k2&0xff);
119 }
120
121 return d;
122 }
123 }
124
125
126 int
cmp_atom_info(atom_info * info,atom_t a2)127 cmp_atom_info(atom_info *info, atom_t a2)
128 { text t2;
129 int i;
130 int dl2 = 0;
131 size_t n;
132
133 if ( info->handle == a2 )
134 return 0;
135
136 if ( !fill_atom_info(info) ||
137 !get_atom_text(a2, &t2) )
138 { goto cmphandles; /* non-text atoms? */
139 }
140
141 if ( info->text.a && t2.a )
142 { const charA *s1 = info->text.a;
143 const charA *s2 = t2.a;
144 int d;
145
146 while((d=cmpA(*s1, *s2, &dl2)) == 0)
147 { if ( *s1 == 0 )
148 goto eq;
149 s1++, s2++;
150 }
151 return d;
152 }
153
154 n = (info->text.length < t2.length ? info->text.length : t2.length);
155
156 if ( info->text.w && t2.w )
157 { const charW *s1 = info->text.w;
158 const charW *s2 = t2.w;
159
160 for(;;s1++, s2++)
161 { if ( n-- == 0 )
162 { if ( info->text.length == t2.length )
163 goto eq;
164
165 return info->text.length < t2.length ? -1 : 1;
166 } else
167 { int d;
168
169 if ( (d=cmpW(*s1, *s2, &dl2)) != 0 )
170 return d;
171 }
172 }
173 }
174
175 for(i=0; ; i++)
176 { if ( n-- == 0 )
177 { if ( info->text.length == t2.length )
178 goto eq;
179
180 return info->text.length < t2.length ? -1 : 1;
181 } else
182 { wint_t c1 = fetch(&info->text, i);
183 wint_t c2 = fetch(&t2, i);
184 int d;
185
186 if ( (d=cmpW(c1, c2, &dl2)) != 0 )
187 return d;
188 }
189 }
190
191 eq:
192 if ( dl2 )
193 return dl2;
194
195 cmphandles:
196 return info->handle < a2 ? -1 : 1; /* == already covered */
197 }
198
199
200 int
cmp_atoms(atom_t a1,atom_t a2)201 cmp_atoms(atom_t a1, atom_t a2)
202 { atom_info info = {0};
203
204 if ( a1 == a2 )
205 return 0;
206
207 info.handle = a1;
208
209 return cmp_atom_info(&info, a2);
210 }
211
212
213 /*******************************
214 * HASH *
215 *******************************/
216
217 static unsigned int
string_hashA(const char * s,size_t len)218 string_hashA(const char *s, size_t len)
219 { const unsigned char *t = (const unsigned char *)s;
220 unsigned int hash = 0;
221
222 while( len>0 )
223 { unsigned char buf[256];
224 unsigned char *o = buf-1;
225 int cp = len > 256 ? 256 : (int)len;
226 const unsigned char *e = t+cp;
227
228 t--;
229 while(++t<e)
230 *++o = sort_pointA(*t)>>8;
231 hash ^= rdf_murmer_hash(buf, cp, MURMUR_SEED);
232
233 len -= cp;
234 }
235
236 return hash;
237 }
238
239
240 static unsigned int
string_hashW(const wchar_t * t,size_t len)241 string_hashW(const wchar_t *t, size_t len)
242 { unsigned int hash = 0;
243
244 while( len>0 )
245 { unsigned short buf[256];
246 unsigned short *o = buf;
247 int cp = len > 256 ? 256 : (int)len;
248 const wchar_t *e = t+cp;
249
250 while(t<e)
251 *o++ = (short)(sort_point(*t++)>>8);
252 hash ^= rdf_murmer_hash(buf, cp*sizeof(short), MURMUR_SEED);
253
254 len -= cp;
255 }
256
257 return hash;
258 }
259
260
261 unsigned int
atom_hash_case(atom_t a)262 atom_hash_case(atom_t a)
263 { const char *s;
264 const wchar_t *w;
265 size_t len;
266
267 if ( (s = PL_atom_nchars(a, &len)) )
268 return string_hashA(s, len);
269 else if ( (w = PL_atom_wchars(a, &len)) )
270 return string_hashW(w, len);
271 else
272 { assert(0);
273 return 0;
274 }
275 }
276
277
278 /*******************************
279 * FIND FIRST *
280 *******************************/
281
282 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
283 Given an atom, return a new one that has all its characters modified
284 such that it appears first in the set of atoms considered equal after
285 case canonisation and diacritics removal. This is required for prefix
286 search to find the first atom of the set.
287 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
288
289 atom_t
first_atom(atom_t a,int match)290 first_atom(atom_t a, int match)
291 { text t;
292
293 if ( !get_atom_text(a, &t) )
294 { return (atom_t)0; /* not a textual atom */
295 } else
296 { size_t len = t.length;
297 wchar_t buf[256];
298 wchar_t *out, *s;
299 int i;
300 wint_t c;
301 atom_t rc;
302
303 if ( len <= 256 )
304 out = buf;
305 else
306 out = PL_malloc(len*sizeof(wchar_t));
307
308 for(s=out,i=0; (c=fetch(&t,i)); s++,i++)
309 { if ( c == '*' && match == STR_MATCH_LIKE )
310 { if ( i == 0 ) /* like '*...' */
311 return (atom_t)0;
312 len = i; /* only up to the first * */
313 }
314 *s = sort_point(c)>>8;
315 }
316
317 rc = PL_new_atom_wchars(len, out);
318
319 if ( out != buf )
320 PL_free(out);
321
322 return rc;
323 }
324 }
325
326 /*******************************
327 * MATCH *
328 *******************************/
329
330 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
331 With the introduction of wide characters there are two versions of the
332 match() function, one using char* and one using a structure and index to
333 fetch characters. Overall performance of the first function is about
334 twice as good as the general one and as most data will be handled by
335 this function in practice I think it is worthwhile to have two
336 implementations. Both implementations are very similar in design and
337 likely to have the same bugs. If you find one, please fix it in both
338 branches!
339 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
340
341 static const charA *
nextwordA(const charA * s)342 nextwordA(const charA *s)
343 { while(*s && iswalnum(*s))
344 s++;
345 while(*s && !iswalnum(*s))
346 s++;
347
348 return s;
349 }
350
351
352 #define cmp_pointA(i) (sort_pointA(i)>>8)
353
354
355 static int
matchA(int how,const charA * f,const charA * l)356 matchA(int how, const charA *f, const charA *l)
357 { switch(how)
358 { case STR_MATCH_EXACT:
359 { for( ; *l && *f; l++, f++ )
360 { if ( cmp_pointA(*l) != cmp_pointA(*f) )
361 return FALSE;
362 }
363 if ( *l == '\0' && *f == '\0' )
364 return TRUE;
365
366 return FALSE;
367 }
368 case STR_MATCH_PREFIX:
369 { for( ; *l && *f; l++, f++ )
370 { if ( cmp_pointA(*l) != cmp_pointA(*f) )
371 return FALSE;
372 }
373 if ( *f == '\0' )
374 return TRUE;
375
376 return FALSE;
377 }
378 case STR_MATCH_SUBSTRING: /* use Boyle-More! */
379 { const charA *h;
380 const charA *f0 = f;
381
382 for(h=l; *h; h++)
383 { for( l=h,f=f0; *l && *f; l++, f++ )
384 { if ( cmp_pointA(*l) != cmp_pointA(*f) )
385 break;
386 }
387 if ( *f == '\0' )
388 return TRUE;
389 if ( *h == '\0' )
390 return FALSE;
391 }
392
393 return FALSE;
394 }
395 case STR_MATCH_WORD:
396 { const charA *h;
397 const charA *f0 = f;
398
399 for(h=l; *h; h = nextwordA(h))
400 { for( l=h,f=f0; *l && *f; l++, f++ )
401 { if ( cmp_pointA(*l) != cmp_pointA(*f) )
402 break;
403 }
404 if ( *f == '\0' )
405 { if ( *l == '\0' || !iswalnum(*l) )
406 return TRUE;
407 }
408 if ( *l == '\0' )
409 return FALSE;
410 }
411
412 return FALSE;
413 }
414 case STR_MATCH_LIKE: /* SeRQL like: * --> wildcart */
415 { typedef struct chp { const charA *pattern;
416 const charA *label; } chp;
417 chp chps[MAX_LIKE_CHOICES];
418 int chn=0;
419
420 for( ; *l && *f; l++, f++ )
421 { if ( *f == '*' )
422 { f++;
423
424 if ( *f == '\0' ) /* foo* */
425 return TRUE;
426
427 search_like:
428 while ( *l && cmp_pointA(*l) != cmp_pointA(*f) )
429 l++;
430
431 if ( *l )
432 { if ( chn >= MAX_LIKE_CHOICES )
433 { Sdprintf("rdf_db: too many * in `like' expression (>%d)",
434 MAX_LIKE_CHOICES);
435 return FALSE;
436 }
437 chps[chn].pattern = f;
438 chps[chn].label = l+1;
439 chn++;
440
441 continue;
442 } else
443 goto retry_like;
444 }
445
446 if ( cmp_pointA(*l) != cmp_pointA(*f) )
447 goto retry_like;
448 }
449 if ( *l == '\0' && (*f == '\0' ||
450 (*f == '*' && f[1] == '\0')) )
451 return TRUE;
452
453 retry_like:
454 if ( chn > 0 )
455 { chn--;
456 f = chps[chn].pattern;
457 l = chps[chn].label;
458 goto search_like;
459 }
460
461 return FALSE;
462 }
463 default:
464 assert(0);
465 return FALSE;
466 }
467 }
468
469
470 static unsigned int
nextword(text * txt,unsigned int i)471 nextword(text *txt, unsigned int i)
472 { while(i<txt->length && iswalnum(fetch(txt, i)))
473 i++;
474 while(i<txt->length && !iswalnum(fetch(txt, i)))
475 i++;
476
477 return i;
478 }
479
480
481 #define cmp_point(i) (sort_point(i)>>8)
482
483
484 int
match_atoms(int how,atom_t search,atom_t label)485 match_atoms(int how, atom_t search, atom_t label)
486 { text l, f;
487
488 if ( !get_atom_text(label, &l) ||
489 !get_atom_text(search, &f) )
490 return FALSE; /* error? */
491
492 if ( f.length == 0 )
493 return TRUE;
494
495 if ( f.a && l.a )
496 return matchA(how, f.a, l.a);
497
498 switch(how)
499 { case STR_MATCH_EXACT:
500 { if ( l.length == f.length )
501 { unsigned int i;
502
503 for(i=0; i<l.length; i++ )
504 { if ( cmp_point(fetch(&l, i)) != cmp_point(fetch(&f, i)) )
505 return FALSE;
506 }
507
508 return TRUE;
509 }
510
511 return FALSE;
512 }
513 case STR_MATCH_PREFIX:
514 { if ( f.length <= l.length )
515 { unsigned int i;
516
517 for(i=0; i<f.length; i++ )
518 { if ( cmp_point(fetch(&l, i)) != cmp_point(fetch(&f, i)) )
519 return FALSE;
520 }
521
522 return TRUE;
523 }
524
525 return FALSE;
526 }
527 case STR_MATCH_SUBSTRING: /* use Boyle-More! */
528 { if ( f.length <= l.length )
529 { unsigned int i, s;
530
531 for(s=0; s+f.length <= l.length; s++)
532 { for(i=0; i<f.length; i++)
533 { if ( cmp_point(fetch(&l, i+s)) != cmp_point(fetch(&f, i)) )
534 goto snext;
535 }
536 return TRUE;
537
538 snext:;
539 }
540 }
541
542 return FALSE;
543 }
544 case STR_MATCH_WORD:
545 { if ( f.length <= l.length )
546 { unsigned int i, s;
547
548 for(s=0; s+f.length <= l.length; s = nextword(&l, s))
549 { for(i=0; i<f.length; i++)
550 { if ( cmp_point(fetch(&l, i+s)) != cmp_point(fetch(&f, i)) )
551 goto wnext;
552 }
553 if ( i+s == l.length || !iswalnum(fetch(&l,i+s)) )
554 return TRUE;
555
556 wnext:;
557 }
558 }
559
560 return FALSE;
561 }
562 case STR_MATCH_LIKE: /* SeRQL like: * --> wildcart */
563 { unsigned int ip, il;
564 typedef struct chp { unsigned int ip;
565 unsigned int il;
566 } chp;
567 chp chps[MAX_LIKE_CHOICES];
568 int chn=0;
569
570 for(ip=il=0; il < l.length && ip < f.length; ip++, il++ )
571 { if ( fetch(&f, ip) == '*' )
572 { ip++;
573
574 if ( ip == f.length ) /* foo* */
575 return TRUE;
576
577 search_like:
578 while ( il < l.length &&
579 cmp_point(fetch(&l, il)) != cmp_point(fetch(&f, ip)) )
580 il++;
581
582 if ( il < l.length )
583 { if ( chn >= MAX_LIKE_CHOICES )
584 { Sdprintf("rdf_db: too many * in `like' expression (>%d)",
585 MAX_LIKE_CHOICES);
586 return FALSE;
587 }
588 chps[chn].ip = ip;
589 chps[chn].il = il+1;
590 chn++;
591
592 continue;
593 } else
594 goto retry_like;
595 }
596
597 if ( cmp_point(fetch(&l, il)) != cmp_point(fetch(&f, ip)) )
598 goto retry_like;
599 }
600 if ( il == l.length && (ip == f.length ||
601 (fetch(&f,ip) == '*' && ip+1 == f.length)) )
602 return TRUE;
603
604 retry_like:
605 if ( chn > 0 )
606 { chn--;
607 ip = chps[chn].ip;
608 il = chps[chn].il;
609 goto search_like;
610 }
611
612 return FALSE;
613 }
614 default:
615 assert(0);
616 return FALSE;
617 }
618 }
619
620
621 /*******************************
622 * LANGUAGE MATCH *
623 *******************************/
624
625 typedef struct lang_choice
626 { int langp; /* points after - */
627 int patp; /* points after *- */
628 } lang_choice;
629
630 #define MAX_CHOICES 10 /* Max number of stars */
631
632 typedef struct
633 { int il, ip;
634 text l, p;
635 lang_choice choicepoints[MAX_CHOICES];
636 int choice_count;
637 } lang_state;
638
639
640 static int
create_chp(lang_state * s)641 create_chp(lang_state *s)
642 { if ( s->choice_count < MAX_CHOICES )
643 { lang_choice *cp = &s->choicepoints[s->choice_count];
644
645 cp->langp = s->il;
646 cp->patp = s->ip+2;
647 s->choice_count++;
648
649 return TRUE;
650 }
651
652 return FALSE;
653 }
654
655
656 static int
next_choice(lang_state * s)657 next_choice(lang_state *s)
658 { for ( ; s->choice_count > 0; s->choice_count-- )
659 { lang_choice *cp = &s->choicepoints[s->choice_count-1];
660 int il = cp->langp;
661
662 for(; il<s->l.length; il++)
663 { if ( fetch(&s->l, il) == '-' )
664 { cp->langp = s->il = il+1;
665 s->ip = cp->patp;
666 return TRUE;
667 }
668 }
669 }
670
671 return FALSE;
672 }
673
674
675 static atom_t ATOM_;
676 static atom_t ATOM_star;
677
678 int
atom_lang_matches(atom_t lang,atom_t pattern)679 atom_lang_matches(atom_t lang, atom_t pattern)
680 { lang_state s = {0};
681 int cl, cp;
682
683 if ( lang == pattern ) /* exact match */
684 return TRUE;
685
686 if ( !ATOM_ )
687 { ATOM_ = PL_new_atom("");
688 ATOM_star = PL_new_atom("*");
689 }
690
691 if ( lang == ATOM_ ) /* no language */
692 return FALSE;
693 if ( pattern == ATOM_star ) /* Everything matches "*" */
694 return TRUE;
695
696 if ( !get_atom_text(lang, &s.l) ||
697 !get_atom_text(pattern, &s.p) )
698 return FALSE; /* exception? */
699
700 s.il=0; s.ip=0;
701 for(;; s.ip++, s.il++)
702 { if ( s.ip == s.p.length )
703 return TRUE;
704 if ( s.il == s.l.length )
705 { if ( fetch(&s.p, s.ip) == '*' )
706 return TRUE;
707 if ( !next_choice(&s) )
708 return FALSE;
709 }
710
711 cl = fetch(&s.l, s.il);
712 cp = fetch(&s.p, s.ip);
713 if ( cl == cp )
714 continue;
715 if ( sort_point(cl)>>8 == sort_point(cp)>>8 )
716 continue;
717
718 if ( cp == '*' )
719 { if ( s.ip+1 == s.p.length )
720 return TRUE;
721 if ( (s.ip == 0 || fetch(&s.p, s.ip-1) == '-') &&
722 fetch(&s.p, s.ip+1) == '-' )
723 { if ( !create_chp(&s) )
724 return FALSE;
725 }
726 }
727
728 if ( !next_choice(&s) )
729 return FALSE;
730 }
731 }
732