1 /*  $Id$
2 
3     Part of the SWI-Prolog Semweb package
4 
5     Author:        Jan Wielemaker
6     E-mail:        wielemak@science.uva.nl
7     WWW:           http://www.swi-prolog.org
8     Copyright (C): 2006, University of Amsterdam
9 
10     This program is free software; you can redistribute it and/or
11     modify it under the terms of the GNU General Public License
12     as published by the Free Software Foundation; either version 2
13     of the License, or (at your option) any later version.
14 
15     This program is distributed in the hope that it will be useful,
16     but WITHOUT ANY WARRANTY; without even the implied warranty of
17     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18     GNU General Public License for more details.
19 
20     You should have received a copy of the GNU Lesser General Public
21     License along with this library; if not, write to the Free Software
22     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23 */
24 
25 #ifdef HAVE_CONFIG_H
26 #include <config.h>
27 #endif
28 
29 #include <SWI-Stream.h>
30 #include <SWI-Prolog.h>
31 #include "atom.h"
32 #include "murmur.h"
33 #include <wchar.h>
34 #include <wctype.h>
35 #include <assert.h>
36 
37 #ifdef __WINDOWS__
38 #define inline __inline
39 #endif
40 
41 #include "unicode_map.c"
42 
43 
44 		 /*******************************
45 		 *	   TEXT HANDLING	*
46 		 *******************************/
47 
48 static inline int
get_atom_text(atom_t atom,text * txt)49 get_atom_text(atom_t atom, text *txt)
50 { if ( (txt->a = (const charA*)PL_atom_nchars(atom, &txt->length)) )
51   { txt->w = NULL;
52     return TRUE;
53   }
54   if ( (txt->w = (const charW*)PL_atom_wchars(atom, &txt->length)) )
55   { txt->a = NULL;
56     return TRUE;
57   }
58 
59   return FALSE;
60 }
61 
62 
63 inline wint_t
fetch(const text * txt,int i)64 fetch(const text *txt, int i)
65 { return txt->a ? (wint_t)txt->a[i] : (wint_t)txt->w[i];
66 }
67 
68 
69 static int
fill_atom_info(atom_info * info)70 fill_atom_info(atom_info *info)
71 { if ( !info->resolved )
72   { info->resolved = TRUE;
73 
74     if ( !(info->rc=get_atom_text(info->handle, &info->text)) )
75     { info->text.a = NULL;
76       info->text.w = NULL;
77     }
78   }
79 
80   return info->rc;
81 }
82 
83 
84 		 /*******************************
85 		 *	      COMPARE		*
86 		 *******************************/
87 
88 static inline int
cmpA(int c1,int c2,int * dl2)89 cmpA(int c1, int c2, int *dl2)
90 { if ( c1 == c2 )
91   { return 0;
92   } else
93   { int k1 = sort_pointA(c1);
94     int k2 = sort_pointA(c2);
95     int d;
96 
97     if ( (d=((k1>>8)-(k2>>8))) == 0 )
98     { if ( *dl2 == 0 )
99 	*dl2 = (k1&0xff) - (k2&0xff);
100     }
101 
102     return d;
103   }
104 }
105 
106 
107 static inline int
cmpW(int c1,int c2,int * dl2)108 cmpW(int c1, int c2, int *dl2)
109 { if ( c1 == c2 )
110   { return 0;
111   } else
112   { int k1 = sort_point(c1);
113     int k2 = sort_point(c2);
114     int d;
115 
116     if ( (d=((k1>>8)-(k2>>8))) == 0 )
117     { if ( *dl2 == 0 )
118 	*dl2 = (k1&0xff) - (k2&0xff);
119     }
120 
121     return d;
122   }
123 }
124 
125 
126 int
cmp_atom_info(atom_info * info,atom_t a2)127 cmp_atom_info(atom_info *info, atom_t a2)
128 { text t2;
129   int i;
130   int dl2 = 0;
131   size_t n;
132 
133   if ( info->handle == a2 )
134     return 0;
135 
136   if ( !fill_atom_info(info) ||
137        !get_atom_text(a2, &t2) )
138   { goto cmphandles;			/* non-text atoms? */
139   }
140 
141   if ( info->text.a && t2.a )
142   { const charA *s1 = info->text.a;
143     const charA *s2 = t2.a;
144     int d;
145 
146     while((d=cmpA(*s1, *s2, &dl2)) == 0)
147     { if ( *s1 == 0 )
148 	goto eq;
149       s1++, s2++;
150     }
151     return d;
152   }
153 
154   n = (info->text.length < t2.length ? info->text.length : t2.length);
155 
156   if ( info->text.w && t2.w )
157   { const charW *s1 = info->text.w;
158     const charW *s2 = t2.w;
159 
160     for(;;s1++, s2++)
161     { if ( n-- == 0 )
162       { if ( info->text.length == t2.length )
163 	  goto eq;
164 
165 	return info->text.length < t2.length ? -1 : 1;
166       } else
167       { int d;
168 
169 	if ( (d=cmpW(*s1, *s2, &dl2)) != 0 )
170 	  return d;
171       }
172     }
173   }
174 
175   for(i=0; ; i++)
176   { if ( n-- == 0 )
177     { if ( info->text.length == t2.length )
178 	  goto eq;
179 
180       return info->text.length < t2.length ? -1 : 1;
181     } else
182     { wint_t c1 = fetch(&info->text, i);
183       wint_t c2 = fetch(&t2, i);
184       int d;
185 
186       if ( (d=cmpW(c1, c2, &dl2)) != 0 )
187 	return d;
188     }
189   }
190 
191 eq:
192   if ( dl2 )
193     return dl2;
194 
195 cmphandles:
196   return info->handle < a2 ? -1 : 1;		/* == already covered */
197 }
198 
199 
200 int
cmp_atoms(atom_t a1,atom_t a2)201 cmp_atoms(atom_t a1, atom_t a2)
202 { atom_info info = {0};
203 
204   if ( a1 == a2 )
205     return 0;
206 
207   info.handle = a1;
208 
209   return cmp_atom_info(&info, a2);
210 }
211 
212 
213 		 /*******************************
214 		 *	       HASH		*
215 		 *******************************/
216 
217 static unsigned int
string_hashA(const char * s,size_t len)218 string_hashA(const char *s, size_t len)
219 { const unsigned char *t = (const unsigned char *)s;
220   unsigned int hash = 0;
221 
222   while( len>0 )
223   { unsigned char buf[256];
224     unsigned char *o = buf-1;
225     int cp = len > 256 ? 256 : (int)len;
226     const unsigned char *e = t+cp;
227 
228     t--;
229     while(++t<e)
230       *++o = sort_pointA(*t)>>8;
231     hash ^= rdf_murmer_hash(buf, cp, MURMUR_SEED);
232 
233     len -= cp;
234   }
235 
236   return hash;
237 }
238 
239 
240 static unsigned int
string_hashW(const wchar_t * t,size_t len)241 string_hashW(const wchar_t *t, size_t len)
242 { unsigned int hash = 0;
243 
244   while( len>0 )
245   { unsigned short buf[256];
246     unsigned short *o = buf;
247     int cp = len > 256 ? 256 : (int)len;
248     const wchar_t *e = t+cp;
249 
250     while(t<e)
251       *o++ = (short)(sort_point(*t++)>>8);
252     hash ^= rdf_murmer_hash(buf, cp*sizeof(short), MURMUR_SEED);
253 
254     len -= cp;
255   }
256 
257   return hash;
258 }
259 
260 
261 unsigned int
atom_hash_case(atom_t a)262 atom_hash_case(atom_t a)
263 { const char *s;
264   const wchar_t *w;
265   size_t len;
266 
267   if ( (s = PL_atom_nchars(a, &len)) )
268     return string_hashA(s, len);
269   else if ( (w = PL_atom_wchars(a, &len)) )
270     return string_hashW(w, len);
271   else
272   { assert(0);
273     return 0;
274   }
275 }
276 
277 
278 		 /*******************************
279 		 *	    FIND FIRST		*
280 		 *******************************/
281 
282 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
283 Given an atom, return a new  one   that  has all its characters modified
284 such that it appears first in the   set  of atoms considered equal after
285 case canonisation and diacritics removal. This   is  required for prefix
286 search to find the first atom of the set.
287 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
288 
289 atom_t
first_atom(atom_t a,int match)290 first_atom(atom_t a, int match)
291 { text t;
292 
293   if ( !get_atom_text(a, &t) )
294   { return (atom_t)0;			/* not a textual atom */
295   } else
296   { size_t len = t.length;
297     wchar_t buf[256];
298     wchar_t *out, *s;
299     int i;
300     wint_t c;
301     atom_t rc;
302 
303     if ( len <= 256 )
304       out = buf;
305     else
306       out = PL_malloc(len*sizeof(wchar_t));
307 
308     for(s=out,i=0; (c=fetch(&t,i)); s++,i++)
309     { if ( c == '*' && match == STR_MATCH_LIKE )
310       { if ( i == 0 )			/* like '*...' */
311 	  return (atom_t)0;
312 	len = i;			/* only up to the first * */
313       }
314       *s = sort_point(c)>>8;
315     }
316 
317     rc = PL_new_atom_wchars(len, out);
318 
319     if ( out != buf )
320       PL_free(out);
321 
322     return rc;
323   }
324 }
325 
326 		 /*******************************
327 		 *	       MATCH		*
328 		 *******************************/
329 
330 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
331 With the introduction of wide characters there   are two versions of the
332 match() function, one using char* and one using a structure and index to
333 fetch characters. Overall performance of  the   first  function is about
334 twice as good as the general one  and   as  most data will be handled by
335 this function in  practice  I  think  it   is  worthwhile  to  have  two
336 implementations. Both implementations are  very   similar  in design and
337 likely to have the same bugs. If  you   find  one, please fix it in both
338 branches!
339 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
340 
341 static const charA *
nextwordA(const charA * s)342 nextwordA(const charA *s)
343 { while(*s && iswalnum(*s))
344     s++;
345   while(*s && !iswalnum(*s))
346     s++;
347 
348   return s;
349 }
350 
351 
352 #define cmp_pointA(i) (sort_pointA(i)>>8)
353 
354 
355 static int
matchA(int how,const charA * f,const charA * l)356 matchA(int how, const charA *f, const charA *l)
357 { switch(how)
358   { case STR_MATCH_EXACT:
359     { for( ; *l && *f; l++, f++ )
360       { if ( cmp_pointA(*l) != cmp_pointA(*f) )
361 	  return FALSE;
362       }
363       if ( *l == '\0' && *f == '\0' )
364 	return TRUE;
365 
366       return FALSE;
367     }
368     case STR_MATCH_PREFIX:
369     { for( ; *l && *f; l++, f++ )
370       { if ( cmp_pointA(*l) != cmp_pointA(*f) )
371 	  return FALSE;
372       }
373       if ( *f == '\0' )
374 	return TRUE;
375 
376       return FALSE;
377     }
378     case STR_MATCH_SUBSTRING:		/* use Boyle-More! */
379     { const charA *h;
380       const charA *f0 = f;
381 
382       for(h=l; *h; h++)
383       { for( l=h,f=f0; *l && *f; l++, f++ )
384 	{ if ( cmp_pointA(*l) != cmp_pointA(*f) )
385 	    break;
386 	}
387 	if ( *f == '\0' )
388 	  return TRUE;
389 	if ( *h == '\0' )
390 	  return FALSE;
391       }
392 
393       return FALSE;
394     }
395     case STR_MATCH_WORD:
396     { const charA *h;
397       const charA *f0 = f;
398 
399       for(h=l; *h; h = nextwordA(h))
400       { for( l=h,f=f0; *l && *f; l++, f++ )
401 	{ if ( cmp_pointA(*l) != cmp_pointA(*f) )
402 	    break;
403 	}
404 	if ( *f == '\0' )
405 	{ if ( *l == '\0' || !iswalnum(*l) )
406 	    return TRUE;
407 	}
408 	if ( *l == '\0' )
409 	  return FALSE;
410       }
411 
412       return FALSE;
413     }
414     case STR_MATCH_LIKE:		/* SeRQL like: * --> wildcart */
415     { typedef struct chp { const charA *pattern;
416 			   const charA *label; } chp;
417       chp chps[MAX_LIKE_CHOICES];
418       int chn=0;
419 
420       for( ; *l && *f; l++, f++ )
421       { if ( *f == '*' )
422 	{ f++;
423 
424 	  if ( *f == '\0' )		/* foo* */
425 	    return TRUE;
426 
427 	search_like:
428 	  while ( *l && cmp_pointA(*l) != cmp_pointA(*f) )
429 	    l++;
430 
431 	  if ( *l )
432 	  { if ( chn >= MAX_LIKE_CHOICES )
433 	    { Sdprintf("rdf_db: too many * in `like' expression (>%d)",
434 		       MAX_LIKE_CHOICES);
435 	      return FALSE;
436 	    }
437 	    chps[chn].pattern = f;
438 	    chps[chn].label   = l+1;
439 	    chn++;
440 
441 	    continue;
442 	  } else
443 	    goto retry_like;
444 	}
445 
446 	if ( cmp_pointA(*l) != cmp_pointA(*f) )
447 	  goto retry_like;
448       }
449       if ( *l == '\0' && (*f == '\0' ||
450 			 (*f == '*' && f[1] == '\0')) )
451 	return TRUE;
452 
453 retry_like:
454       if ( chn > 0 )
455       { chn--;
456 	f = chps[chn].pattern;
457 	l = chps[chn].label;
458 	goto search_like;
459       }
460 
461       return FALSE;
462     }
463     default:
464       assert(0);
465       return FALSE;
466   }
467 }
468 
469 
470 static unsigned int
nextword(text * txt,unsigned int i)471 nextword(text *txt, unsigned int i)
472 { while(i<txt->length && iswalnum(fetch(txt, i)))
473     i++;
474   while(i<txt->length && !iswalnum(fetch(txt, i)))
475     i++;
476 
477   return i;
478 }
479 
480 
481 #define cmp_point(i) (sort_point(i)>>8)
482 
483 
484 int
match_atoms(int how,atom_t search,atom_t label)485 match_atoms(int how, atom_t search, atom_t label)
486 { text l, f;
487 
488   if ( !get_atom_text(label, &l) ||
489        !get_atom_text(search, &f) )
490     return FALSE;			/* error? */
491 
492   if ( f.length == 0 )
493     return TRUE;
494 
495   if ( f.a && l.a )
496     return matchA(how, f.a, l.a);
497 
498   switch(how)
499   { case STR_MATCH_EXACT:
500     { if ( l.length == f.length )
501       { unsigned int i;
502 
503 	for(i=0; i<l.length; i++ )
504 	{ if ( cmp_point(fetch(&l, i)) != cmp_point(fetch(&f, i)) )
505 	    return FALSE;
506 	}
507 
508         return TRUE;
509       }
510 
511       return FALSE;
512     }
513     case STR_MATCH_PREFIX:
514     { if ( f.length <= l.length )
515       { unsigned int i;
516 
517 	for(i=0; i<f.length; i++ )
518 	{ if ( cmp_point(fetch(&l, i)) != cmp_point(fetch(&f, i)) )
519 	    return FALSE;
520 	}
521 
522 	return TRUE;
523       }
524 
525       return FALSE;
526     }
527     case STR_MATCH_SUBSTRING:		/* use Boyle-More! */
528     { if ( f.length <= l.length )
529       { unsigned int i, s;
530 
531 	for(s=0; s+f.length <= l.length; s++)
532 	{ for(i=0; i<f.length; i++)
533 	  { if ( cmp_point(fetch(&l, i+s)) != cmp_point(fetch(&f, i)) )
534 	      goto snext;
535 	  }
536 	  return TRUE;
537 
538 	snext:;
539 	}
540       }
541 
542       return FALSE;
543     }
544     case STR_MATCH_WORD:
545     { if ( f.length <= l.length )
546       { unsigned int i, s;
547 
548 	for(s=0; s+f.length <= l.length; s = nextword(&l, s))
549 	{ for(i=0; i<f.length; i++)
550 	  { if ( cmp_point(fetch(&l, i+s)) != cmp_point(fetch(&f, i)) )
551 	      goto wnext;
552 	  }
553 	  if ( i+s == l.length || !iswalnum(fetch(&l,i+s)) )
554 	    return TRUE;
555 
556 	wnext:;
557 	}
558       }
559 
560       return FALSE;
561     }
562     case STR_MATCH_LIKE:		/* SeRQL like: * --> wildcart */
563     { unsigned int ip, il;
564       typedef struct chp { unsigned int ip;
565 			   unsigned int il;
566 			 } chp;
567       chp chps[MAX_LIKE_CHOICES];
568       int chn=0;
569 
570       for(ip=il=0; il < l.length && ip < f.length; ip++, il++ )
571       { if ( fetch(&f, ip) == '*' )
572 	{ ip++;
573 
574 	  if ( ip == f.length )		/* foo* */
575 	    return TRUE;
576 
577 	search_like:
578 	  while ( il < l.length &&
579 		  cmp_point(fetch(&l, il)) != cmp_point(fetch(&f, ip)) )
580 	    il++;
581 
582 	  if ( il < l.length )
583 	  { if ( chn >= MAX_LIKE_CHOICES )
584 	    { Sdprintf("rdf_db: too many * in `like' expression (>%d)",
585 		       MAX_LIKE_CHOICES);
586 	      return FALSE;
587 	    }
588 	    chps[chn].ip = ip;
589 	    chps[chn].il = il+1;
590 	    chn++;
591 
592 	    continue;
593 	  } else
594 	    goto retry_like;
595 	}
596 
597 	if ( cmp_point(fetch(&l, il)) != cmp_point(fetch(&f, ip)) )
598 	  goto retry_like;
599       }
600       if ( il == l.length && (ip == f.length ||
601 			      (fetch(&f,ip) == '*' && ip+1 == f.length)) )
602 	return TRUE;
603 
604 retry_like:
605       if ( chn > 0 )
606       { chn--;
607 	ip = chps[chn].ip;
608 	il = chps[chn].il;
609 	goto search_like;
610       }
611 
612       return FALSE;
613     }
614     default:
615       assert(0);
616       return FALSE;
617   }
618 }
619 
620 
621 		 /*******************************
622 		 *	  LANGUAGE MATCH	*
623 		 *******************************/
624 
625 typedef struct lang_choice
626 { int langp;				/* points after - */
627   int patp;				/* points after *- */
628 } lang_choice;
629 
630 #define MAX_CHOICES 10			/* Max number of stars */
631 
632 typedef struct
633 { int il, ip;
634   text l, p;
635   lang_choice choicepoints[MAX_CHOICES];
636   int choice_count;
637 } lang_state;
638 
639 
640 static int
create_chp(lang_state * s)641 create_chp(lang_state *s)
642 { if ( s->choice_count < MAX_CHOICES )
643   { lang_choice *cp = &s->choicepoints[s->choice_count];
644 
645     cp->langp = s->il;
646     cp->patp = s->ip+2;
647     s->choice_count++;
648 
649     return TRUE;
650   }
651 
652   return FALSE;
653 }
654 
655 
656 static int
next_choice(lang_state * s)657 next_choice(lang_state *s)
658 { for ( ; s->choice_count > 0; s->choice_count-- )
659   { lang_choice *cp = &s->choicepoints[s->choice_count-1];
660     int il = cp->langp;
661 
662     for(; il<s->l.length; il++)
663     { if ( fetch(&s->l, il) == '-' )
664       { cp->langp = s->il = il+1;
665 	s->ip = cp->patp;
666 	return TRUE;
667       }
668     }
669   }
670 
671   return FALSE;
672 }
673 
674 
675 static atom_t ATOM_;
676 static atom_t ATOM_star;
677 
678 int
atom_lang_matches(atom_t lang,atom_t pattern)679 atom_lang_matches(atom_t lang, atom_t pattern)
680 { lang_state s = {0};
681   int cl, cp;
682 
683   if ( lang == pattern )		/* exact match */
684     return TRUE;
685 
686   if ( !ATOM_ )
687   { ATOM_ = PL_new_atom("");
688     ATOM_star = PL_new_atom("*");
689   }
690 
691   if ( lang == ATOM_ )			/* no language */
692     return FALSE;
693   if ( pattern == ATOM_star )		/* Everything matches "*" */
694     return TRUE;
695 
696   if ( !get_atom_text(lang, &s.l) ||
697        !get_atom_text(pattern, &s.p) )
698     return FALSE;			/* exception? */
699 
700   s.il=0; s.ip=0;
701   for(;; s.ip++, s.il++)
702   { if ( s.ip == s.p.length )
703       return TRUE;
704     if ( s.il == s.l.length )
705     { if ( fetch(&s.p, s.ip) == '*' )
706 	return TRUE;
707       if ( !next_choice(&s) )
708 	return FALSE;
709     }
710 
711     cl = fetch(&s.l, s.il);
712     cp = fetch(&s.p, s.ip);
713     if ( cl == cp )
714       continue;
715     if ( sort_point(cl)>>8 == sort_point(cp)>>8 )
716       continue;
717 
718     if ( cp == '*' )
719     { if ( s.ip+1 == s.p.length )
720 	return TRUE;
721       if ( (s.ip == 0 || fetch(&s.p, s.ip-1) == '-') &&
722 	   fetch(&s.p, s.ip+1) == '-' )
723       { if ( !create_chp(&s) )
724 	  return FALSE;
725       }
726     }
727 
728     if ( !next_choice(&s) )
729       return FALSE;
730   }
731 }
732