1 /*  Part of SWI-Prolog
2 
3     Author:        Jan Wielemaker
4     E-mail:        J.Wielemaker@vu.nl
5     WWW:           http://www.swi-prolog.org
6     Copyright (c)  2006-2016, University of Amsterdam
7                               VU University Amsterdam
8     All rights reserved.
9 
10     Redistribution and use in source and binary forms, with or without
11     modification, are permitted provided that the following conditions
12     are met:
13 
14     1. Redistributions of source code must retain the above copyright
15        notice, this list of conditions and the following disclaimer.
16 
17     2. Redistributions in binary form must reproduce the above copyright
18        notice, this list of conditions and the following disclaimer in
19        the documentation and/or other materials provided with the
20        distribution.
21 
22     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26     COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
32     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33     POSSIBILITY OF SUCH DAMAGE.
34 */
35 
36 #include <config.h>
37 #include <SWI-Stream.h>
38 #include <SWI-Prolog.h>
39 #include "rdf_db.h"
40 #include "atom.h"
41 #include "murmur.h"
42 #include <wchar.h>
43 #include <wctype.h>
44 #include <assert.h>
45 
46 #ifdef __WINDOWS__
47 #define inline __inline
48 #endif
49 
50 #include "unicode_map.c"
51 
52 
53 		 /*******************************
54 		 *	   TEXT HANDLING	*
55 		 *******************************/
56 
57 int
fetch_atom_text(atom_t atom,text * txt)58 fetch_atom_text(atom_t atom, text *txt)
59 { if ( (txt->a = (const charA*)PL_atom_nchars(atom, &txt->length)) )
60   { txt->w = NULL;
61     return TRUE;
62   }
63   if ( (txt->w = (const charW*)PL_atom_wchars(atom, &txt->length)) )
64   { txt->a = NULL;
65     return TRUE;
66   }
67 
68   return FALSE;
69 }
70 
71 
72 static inline wint_t
fetch(const text * txt,int i)73 fetch(const text *txt, int i)
74 { return txt->a ? (wint_t)txt->a[i] : (wint_t)txt->w[i];
75 }
76 
77 
78 int
fill_atom_info(atom_info * info)79 fill_atom_info(atom_info *info)
80 { if ( !info->resolved )
81   { info->resolved = TRUE;
82 
83     if ( !(info->rc=fetch_atom_text(info->handle, &info->text)) )
84     { info->text.a = NULL;
85       info->text.w = NULL;
86     }
87   }
88 
89   return info->rc;
90 }
91 
92 
93 		 /*******************************
94 		 *	      COMPARE		*
95 		 *******************************/
96 
97 static inline int
cmpA(int c1,int c2,int * dl2)98 cmpA(int c1, int c2, int *dl2)
99 { if ( c1 == c2 )
100   { return 0;
101   } else
102   { int k1 = sort_pointA(c1);
103     int k2 = sort_pointA(c2);
104     int d;
105 
106     if ( (d=((k1>>8)-(k2>>8))) == 0 )
107     { if ( *dl2 == 0 )
108 	*dl2 = (k1&0xff) - (k2&0xff);
109     }
110 
111     return d;
112   }
113 }
114 
115 
116 static inline int
cmpW(int c1,int c2,int * dl2)117 cmpW(int c1, int c2, int *dl2)
118 { if ( c1 == c2 )
119   { return 0;
120   } else
121   { int k1 = sort_point(c1);
122     int k2 = sort_point(c2);
123     int d;
124 
125     if ( (d=((k1>>8)-(k2>>8))) == 0 )
126     { if ( *dl2 == 0 )
127 	*dl2 = (k1&0xff) - (k2&0xff);
128     }
129 
130     return d;
131   }
132 }
133 
134 
135 int
cmp_atom_info(atom_info * info,atom_t a2)136 cmp_atom_info(atom_info *info, atom_t a2)
137 { text t2;
138   int i;
139   int dl2 = 0;
140   size_t n;
141 
142   if ( info->handle == a2 )
143     return 0;
144 
145   if ( !fill_atom_info(info) ||
146        !fetch_atom_text(a2, &t2) )
147   { goto cmphandles;			/* non-text atoms? */
148   }
149 
150   if ( info->text.a && t2.a )
151   { const charA *s1 = info->text.a;
152     const charA *s2 = t2.a;
153     int d;
154 
155     while((d=cmpA(*s1, *s2, &dl2)) == 0)
156     { if ( *s1 == 0 )
157 	goto eq;
158       s1++, s2++;
159     }
160     return d;
161   }
162 
163   n = (info->text.length < t2.length ? info->text.length : t2.length);
164 
165   if ( info->text.w && t2.w )
166   { const charW *s1 = info->text.w;
167     const charW *s2 = t2.w;
168 
169     for(;;s1++, s2++)
170     { if ( n-- == 0 )
171       { if ( info->text.length == t2.length )
172 	  goto eq;
173 
174 	return info->text.length < t2.length ? -1 : 1;
175       } else
176       { int d;
177 
178 	if ( (d=cmpW(*s1, *s2, &dl2)) != 0 )
179 	  return d;
180       }
181     }
182   }
183 
184   for(i=0; ; i++)
185   { if ( n-- == 0 )
186     { if ( info->text.length == t2.length )
187 	  goto eq;
188 
189       return info->text.length < t2.length ? -1 : 1;
190     } else
191     { wint_t c1 = fetch(&info->text, i);
192       wint_t c2 = fetch(&t2, i);
193       int d;
194 
195       if ( (d=cmpW(c1, c2, &dl2)) != 0 )
196 	return d;
197     }
198   }
199 
200 eq:
201   if ( dl2 )
202     return dl2;
203 
204 cmphandles:
205   return info->handle < a2 ? -1 : 1;		/* == already covered */
206 }
207 
208 
209 int
cmp_atoms(atom_t a1,atom_t a2)210 cmp_atoms(atom_t a1, atom_t a2)
211 { atom_info info = {0};
212 
213   if ( a1 == a2 )
214     return 0;
215 
216   info.handle = a1;
217 
218   return cmp_atom_info(&info, a2);
219 }
220 
221 
222 		 /*******************************
223 		 *	       HASH		*
224 		 *******************************/
225 
226 static unsigned int
string_hashA(const char * s,size_t len)227 string_hashA(const char *s, size_t len)
228 { const unsigned char *t = (const unsigned char *)s;
229   unsigned int hash = 0;
230 
231   while( len>0 )
232   { unsigned char buf[256];
233     unsigned char *o = buf-1;
234     int cp = len > 256 ? 256 : (int)len;
235     const unsigned char *e = t+cp;
236 
237     t--;
238     while(++t<e)
239       *++o = sort_pointA(*t)>>8;
240     hash ^= rdf_murmer_hash(buf, cp, MURMUR_SEED);
241 
242     len -= cp;
243   }
244 
245   return hash;
246 }
247 
248 
249 static unsigned int
string_hashW(const wchar_t * t,size_t len)250 string_hashW(const wchar_t *t, size_t len)
251 { unsigned int hash = 0;
252 
253   while( len>0 )
254   { unsigned short buf[256];
255     unsigned short *o = buf;
256     int cp = len > 256 ? 256 : (int)len;
257     const wchar_t *e = t+cp;
258 
259     while(t<e)
260       *o++ = (short)(sort_point(*t++)>>8);
261     hash ^= rdf_murmer_hash(buf, cp*sizeof(short), MURMUR_SEED);
262 
263     len -= cp;
264   }
265 
266   return hash;
267 }
268 
269 
270 unsigned int
atom_hash_case(atom_t a)271 atom_hash_case(atom_t a)
272 { const char *s;
273   const wchar_t *w;
274   size_t len;
275 
276   if ( (s = PL_atom_nchars(a, &len)) )
277     return string_hashA(s, len);
278   else if ( (w = PL_atom_wchars(a, &len)) )
279     return string_hashW(w, len);
280   else
281   { assert(0);
282     return 0;
283   }
284 }
285 
286 
287 		 /*******************************
288 		 *	    FIND FIRST		*
289 		 *******************************/
290 
291 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
292 Given an atom, return a new  one   that  has all its characters modified
293 such that it appears first in the   set  of atoms considered equal after
294 case canonisation and diacritics removal. This   is  required for prefix
295 search to find the first atom of the set.
296 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
297 
298 atom_t
first_atom(atom_t a,int match)299 first_atom(atom_t a, int match)
300 { text t;
301 
302   if ( !fetch_atom_text(a, &t) )
303   { return (atom_t)0;			/* not a textual atom */
304   } else
305   { size_t len = t.length;
306     wchar_t buf[256];
307     wchar_t *out, *s;
308     int i;
309     atom_t rc;
310 
311     if ( len <= 256 )
312       out = buf;
313     else
314       out = PL_malloc(len*sizeof(wchar_t));
315 
316     for(s=out,i=0; i<len; s++,i++)
317     { wint_t c = fetch(&t,i);
318 
319       if ( c == '*' && match == STR_MATCH_LIKE )
320       { if ( i == 0 )			/* like '*...' */
321 	{ rc = 0;
322 	  goto out;
323 	}
324 	len = i;			/* only up to the first * */
325 	break;
326       }
327       *s = sort_point(c)>>8;
328     }
329 
330     rc = PL_new_atom_wchars(len, out);
331 
332   out:
333     if ( out != buf )
334       PL_free(out);
335 
336     return rc;
337   }
338 }
339 
340 		 /*******************************
341 		 *	       MATCH		*
342 		 *******************************/
343 
344 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
345 With the introduction of wide characters there   are two versions of the
346 match() function, one using char* and one using a structure and index to
347 fetch characters. Overall performance of  the   first  function is about
348 twice as good as the general one  and   as  most data will be handled by
349 this function in  practice  I  think  it   is  worthwhile  to  have  two
350 implementations. Both implementations are  very   similar  in design and
351 likely to have the same bugs. If  you   find  one, please fix it in both
352 branches!
353 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
354 
355 static const charA *
nextwordA(const charA * s)356 nextwordA(const charA *s)
357 { while(*s && iswalnum(*s))
358     s++;
359   while(*s && !iswalnum(*s))
360     s++;
361 
362   return s;
363 }
364 
365 
366 #define cmp_pointA(i) (sort_pointA(i)>>8)
367 
368 
369 static int
matchA(int how,const charA * f,const charA * l)370 matchA(int how, const charA *f, const charA *l)
371 { switch(how)
372   { case STR_MATCH_ICASE:
373     { for( ; *l && *f; l++, f++ )
374       { if ( cmp_pointA(*l) != cmp_pointA(*f) )
375 	  return FALSE;
376       }
377       if ( *l == '\0' && *f == '\0' )
378 	return TRUE;
379 
380       return FALSE;
381     }
382     case STR_MATCH_PREFIX:
383     { for( ; *l && *f; l++, f++ )
384       { if ( cmp_pointA(*l) != cmp_pointA(*f) )
385 	  return FALSE;
386       }
387       if ( *f == '\0' )
388 	return TRUE;
389 
390       return FALSE;
391     }
392     case STR_MATCH_SUBSTRING:		/* use Boyle-More! */
393     { const charA *h;
394       const charA *f0 = f;
395 
396       for(h=l; *h; h++)
397       { for( l=h,f=f0; *l && *f; l++, f++ )
398 	{ if ( cmp_pointA(*l) != cmp_pointA(*f) )
399 	    break;
400 	}
401 	if ( *f == '\0' )
402 	  return TRUE;
403 	if ( *h == '\0' )
404 	  return FALSE;
405       }
406 
407       return FALSE;
408     }
409     case STR_MATCH_WORD:
410     { const charA *h;
411       const charA *f0 = f;
412 
413       for(h=l; *h; h = nextwordA(h))
414       { for( l=h,f=f0; *l && *f; l++, f++ )
415 	{ if ( cmp_pointA(*l) != cmp_pointA(*f) )
416 	    break;
417 	}
418 	if ( *f == '\0' )
419 	{ if ( *l == '\0' || !iswalnum(*l) )
420 	    return TRUE;
421 	}
422 	if ( *l == '\0' )
423 	  return FALSE;
424       }
425 
426       return FALSE;
427     }
428     case STR_MATCH_LIKE:		/* SeRQL like: * --> wildcart */
429     { typedef struct chp { const charA *pattern;
430 			   const charA *label; } chp;
431       chp chps[MAX_LIKE_CHOICES];
432       int chn=0;
433 
434       for( ; *l && *f; l++, f++ )
435       { if ( *f == '*' )
436 	{ f++;
437 
438 	  if ( *f == '\0' )		/* foo* */
439 	    return TRUE;
440 
441 	search_like:
442 	  while ( *l && cmp_pointA(*l) != cmp_pointA(*f) )
443 	    l++;
444 
445 	  if ( *l )
446 	  { if ( chn >= MAX_LIKE_CHOICES )
447 	    { Sdprintf("rdf_db: too many * in `like' expression (>%d)",
448 		       MAX_LIKE_CHOICES);
449 	      return FALSE;
450 	    }
451 	    chps[chn].pattern = f;
452 	    chps[chn].label   = l+1;
453 	    chn++;
454 
455 	    continue;
456 	  } else
457 	    goto retry_like;
458 	}
459 
460 	if ( cmp_pointA(*l) != cmp_pointA(*f) )
461 	  goto retry_like;
462       }
463       if ( *l == '\0' && (*f == '\0' ||
464 			 (*f == '*' && f[1] == '\0')) )
465 	return TRUE;
466 
467 retry_like:
468       if ( chn > 0 )
469       { chn--;
470 	f = chps[chn].pattern;
471 	l = chps[chn].label;
472 	goto search_like;
473       }
474 
475       return FALSE;
476     }
477     default:
478       assert(0);
479       return FALSE;
480   }
481 }
482 
483 
484 static unsigned int
nextword(text * txt,unsigned int i)485 nextword(text *txt, unsigned int i)
486 { while(i<txt->length && iswalnum(fetch(txt, i)))
487     i++;
488   while(i<txt->length && !iswalnum(fetch(txt, i)))
489     i++;
490 
491   return i;
492 }
493 
494 
495 #define cmp_point(i) (sort_point(i)>>8)
496 
497 
498 int
match_atoms(int how,atom_t search,atom_t label)499 match_atoms(int how, atom_t search, atom_t label)
500 { text l, f;
501 
502   if ( !fetch_atom_text(label, &l) ||
503        !fetch_atom_text(search, &f) )
504     return FALSE;			/* error? */
505 
506   return match_text(how, &f, &l);
507 }
508 
509 
510 int
match_text(int how,text * f,text * l)511 match_text(int how, text *f, text *l)
512 { if ( f->length == 0 )
513     return TRUE;
514 
515   if ( f->a && l->a )
516     return matchA(how, f->a, l->a);
517 
518   switch(how)
519   { case STR_MATCH_ICASE:
520     { if ( l->length == f->length )
521       { unsigned int i;
522 
523 	for(i=0; i<l->length; i++ )
524 	{ if ( cmp_point(fetch(l, i)) != cmp_point(fetch(f, i)) )
525 	    return FALSE;
526 	}
527 
528         return TRUE;
529       }
530 
531       return FALSE;
532     }
533     case STR_MATCH_PREFIX:
534     { if ( f->length <= l->length )
535       { unsigned int i;
536 
537 	for(i=0; i<f->length; i++ )
538 	{ if ( cmp_point(fetch(l, i)) != cmp_point(fetch(f, i)) )
539 	    return FALSE;
540 	}
541 
542 	return TRUE;
543       }
544 
545       return FALSE;
546     }
547     case STR_MATCH_SUBSTRING:		/* use Boyle-More! */
548     { if ( f->length <= l->length )
549       { unsigned int i, s;
550 
551 	for(s=0; s+f->length <= l->length; s++)
552 	{ for(i=0; i<f->length; i++)
553 	  { if ( cmp_point(fetch(l, i+s)) != cmp_point(fetch(f, i)) )
554 	      goto snext;
555 	  }
556 	  return TRUE;
557 
558 	snext:;
559 	}
560       }
561 
562       return FALSE;
563     }
564     case STR_MATCH_WORD:
565     { if ( f->length <= l->length )
566       { unsigned int i, s;
567 
568 	for(s=0; s+f->length <= l->length; s = nextword(l, s))
569 	{ for(i=0; i<f->length; i++)
570 	  { if ( cmp_point(fetch(l, i+s)) != cmp_point(fetch(f, i)) )
571 	      goto wnext;
572 	  }
573 	  if ( i+s == l->length || !iswalnum(fetch(l,i+s)) )
574 	    return TRUE;
575 
576 	wnext:;
577 	}
578       }
579 
580       return FALSE;
581     }
582     case STR_MATCH_LIKE:		/* SeRQL like: * --> wildcart */
583     { unsigned int ip, il;
584       typedef struct chp { unsigned int ip;
585 			   unsigned int il;
586 			 } chp;
587       chp chps[MAX_LIKE_CHOICES];
588       int chn=0;
589 
590       for(ip=il=0; il < l->length && ip < f->length; ip++, il++ )
591       { if ( fetch(f, ip) == '*' )
592 	{ ip++;
593 
594 	  if ( ip == f->length )		/* foo* */
595 	    return TRUE;
596 
597 	search_like:
598 	  while ( il < l->length &&
599 		  cmp_point(fetch(l, il)) != cmp_point(fetch(f, ip)) )
600 	    il++;
601 
602 	  if ( il < l->length )
603 	  { if ( chn >= MAX_LIKE_CHOICES )
604 	    { Sdprintf("rdf_db: too many * in `like' expression (>%d)",
605 		       MAX_LIKE_CHOICES);
606 	      return FALSE;
607 	    }
608 	    chps[chn].ip = ip;
609 	    chps[chn].il = il+1;
610 	    chn++;
611 
612 	    continue;
613 	  } else
614 	    goto retry_like;
615 	}
616 
617 	if ( cmp_point(fetch(l, il)) != cmp_point(fetch(f, ip)) )
618 	  goto retry_like;
619       }
620       if ( il == l->length && (ip == f->length ||
621 			      (fetch(f,ip) == '*' && ip+1 == f->length)) )
622 	return TRUE;
623 
624 retry_like:
625       if ( chn > 0 )
626       { chn--;
627 	ip = chps[chn].ip;
628 	il = chps[chn].il;
629 	goto search_like;
630       }
631 
632       return FALSE;
633     }
634     default:
635       assert(0);
636       return FALSE;
637   }
638 }
639 
640 
641 		 /*******************************
642 		 *	  LANGUAGE MATCH	*
643 		 *******************************/
644 
645 typedef struct lang_choice
646 { int langp;				/* points after - */
647   int patp;				/* points after *- */
648 } lang_choice;
649 
650 #define MAX_CHOICES 10			/* Max number of stars */
651 
652 typedef struct
653 { int il, ip;
654   text l, p;
655   lang_choice choicepoints[MAX_CHOICES];
656   int choice_count;
657 } lang_state;
658 
659 
660 static int
create_chp(lang_state * s)661 create_chp(lang_state *s)
662 { if ( s->choice_count < MAX_CHOICES )
663   { lang_choice *cp = &s->choicepoints[s->choice_count];
664 
665     cp->langp = s->il;
666     cp->patp = s->ip+2;
667     s->choice_count++;
668 
669     return TRUE;
670   }
671 
672   return FALSE;
673 }
674 
675 
676 static int
next_choice(lang_state * s)677 next_choice(lang_state *s)
678 { for ( ; s->choice_count > 0; s->choice_count-- )
679   { lang_choice *cp = &s->choicepoints[s->choice_count-1];
680     int il = cp->langp;
681 
682     for(; il<s->l.length; il++)
683     { if ( fetch(&s->l, il) == '-' )
684       { cp->langp = s->il = il+1;
685 	s->ip = cp->patp;
686 	return TRUE;
687       }
688     }
689   }
690 
691   return FALSE;
692 }
693 
694 
695 static atom_t ATOM_;
696 static atom_t ATOM_star;
697 
698 int
atom_lang_matches(atom_t lang,atom_t pattern)699 atom_lang_matches(atom_t lang, atom_t pattern)
700 { lang_state s = {0};
701   int cl, cp;
702 
703   if ( lang == pattern )		/* exact match */
704     return TRUE;
705 
706   if ( !ATOM_ )
707   { ATOM_ = PL_new_atom("");
708     ATOM_star = PL_new_atom("*");
709   }
710 
711   if ( lang == ATOM_ )			/* no language */
712     return FALSE;
713   if ( pattern == ATOM_star )		/* Everything matches "*" */
714     return TRUE;
715 
716   if ( !fetch_atom_text(lang, &s.l) ||
717        !fetch_atom_text(pattern, &s.p) )
718     return FALSE;			/* exception? */
719 
720   s.il=0; s.ip=0;
721   for(;; s.ip++, s.il++)
722   { if ( s.ip == s.p.length )
723       return TRUE;
724     if ( s.il == s.l.length )
725     { if ( fetch(&s.p, s.ip) == '*' )
726 	return TRUE;
727       if ( !next_choice(&s) )
728 	return FALSE;
729     }
730 
731     cl = fetch(&s.l, s.il);
732     cp = fetch(&s.p, s.ip);
733     if ( cl == cp )
734       continue;
735     if ( sort_point(cl)>>8 == sort_point(cp)>>8 )
736       continue;
737 
738     if ( cp == '*' )
739     { if ( s.ip+1 == s.p.length )
740 	return TRUE;
741       if ( (s.ip == 0 || fetch(&s.p, s.ip-1) == '-') &&
742 	   fetch(&s.p, s.ip+1) == '-' )
743       { if ( !create_chp(&s) )
744 	  return FALSE;
745       }
746     }
747 
748     if ( !next_choice(&s) )
749       return FALSE;
750   }
751 }
752